{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "1QiCFLer1FIe" }, "source": [ "**Lab 11 – Dimensional reduction and clustering**" ] }, { "cell_type": "markdown", "metadata": { "id": "vCyq3-8y1FIj" }, "source": [ "_This notebook contains the sample from https://www.kaggle.com/learn/feature-engineering, https://inria.github.io/scikit-learn-mooc/python_scripts/dev_features_importance.html#, https://scikit-learn.org/stable/modules/feature_selection.html, https://scikit-learn.org/stable/modules/preprocessing.html#, https://scikit-learn.org/stable/modules/unsupervised_reduction.html and https://github.com/ageron/handson-ml2/blob/master/09_unsupervised_learning.ipynb" ] }, { "cell_type": "markdown", "metadata": { "id": "9J5g6PDs1FIk" }, "source": [ "\n", " \n", " \n", "
\n", " \"Open\n", " \n", " \n", "
" ] }, { "cell_type": "code", "source": [ "!pip install Boruta\n", "!pip install opentsne\n", "!pip install umap-learn" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "TTpvSn-KRgu_", "outputId": "54330545-d536-4a9d-dbf7-dc0c67f90e39" }, "execution_count": 108, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Requirement already satisfied: Boruta in /usr/local/lib/python3.7/dist-packages (0.3)\n", "Requirement already satisfied: scikit-learn>=0.17.1 in /usr/local/lib/python3.7/dist-packages (from Boruta) (1.0.2)\n", "Requirement already satisfied: scipy>=0.17.0 in /usr/local/lib/python3.7/dist-packages (from Boruta) (1.4.1)\n", "Requirement already satisfied: numpy>=1.10.4 in /usr/local/lib/python3.7/dist-packages (from Boruta) (1.21.6)\n", "Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.7/dist-packages (from scikit-learn>=0.17.1->Boruta) (1.1.0)\n", "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.7/dist-packages (from scikit-learn>=0.17.1->Boruta) (3.1.0)\n", "Requirement already satisfied: opentsne in /usr/local/lib/python3.7/dist-packages (0.6.2)\n", "Requirement already satisfied: numpy>=1.16.6 in /usr/local/lib/python3.7/dist-packages (from opentsne) (1.21.6)\n", "Requirement already satisfied: scipy in /usr/local/lib/python3.7/dist-packages (from opentsne) (1.4.1)\n", "Requirement already satisfied: scikit-learn>=0.20 in /usr/local/lib/python3.7/dist-packages (from opentsne) (1.0.2)\n", "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.7/dist-packages (from scikit-learn>=0.20->opentsne) (3.1.0)\n", "Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.7/dist-packages (from scikit-learn>=0.20->opentsne) (1.1.0)\n", "Collecting umap-learn\n", " Downloading umap-learn-0.5.3.tar.gz (88 kB)\n", "\u001b[K |████████████████████████████████| 88 kB 4.7 MB/s \n", "\u001b[?25hRequirement already satisfied: numpy>=1.17 in 
/usr/local/lib/python3.7/dist-packages (from umap-learn) (1.21.6)\n", "Requirement already satisfied: scikit-learn>=0.22 in /usr/local/lib/python3.7/dist-packages (from umap-learn) (1.0.2)\n", "Requirement already satisfied: scipy>=1.0 in /usr/local/lib/python3.7/dist-packages (from umap-learn) (1.4.1)\n", "Requirement already satisfied: numba>=0.49 in /usr/local/lib/python3.7/dist-packages (from umap-learn) (0.51.2)\n", "Collecting pynndescent>=0.5\n", " Downloading pynndescent-0.5.6.tar.gz (1.1 MB)\n", "\u001b[K |████████████████████████████████| 1.1 MB 43.0 MB/s \n", "\u001b[?25hRequirement already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (from umap-learn) (4.64.0)\n", "Requirement already satisfied: setuptools in /usr/local/lib/python3.7/dist-packages (from numba>=0.49->umap-learn) (57.4.0)\n", "Requirement already satisfied: llvmlite<0.35,>=0.34.0.dev0 in /usr/local/lib/python3.7/dist-packages (from numba>=0.49->umap-learn) (0.34.0)\n", "Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.7/dist-packages (from pynndescent>=0.5->umap-learn) (1.1.0)\n", "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.7/dist-packages (from scikit-learn>=0.22->umap-learn) (3.1.0)\n", "Building wheels for collected packages: umap-learn, pynndescent\n", " Building wheel for umap-learn (setup.py) ... \u001b[?25l\u001b[?25hdone\n", " Created wheel for umap-learn: filename=umap_learn-0.5.3-py3-none-any.whl size=82829 sha256=da74e5543b87d2622e3b52ec5d22885cca380304f0cab864c22ea213ca2772cf\n", " Stored in directory: /root/.cache/pip/wheels/b3/52/a5/1fd9e3e76a7ab34f134c07469cd6f16e27ef3a37aeff1fe821\n", " Building wheel for pynndescent (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", " Created wheel for pynndescent: filename=pynndescent-0.5.6-py3-none-any.whl size=53943 sha256=e3b27cfabfad50b66be2d564d14ffdb820563b3bbff57d51323d3e825e997ae1\n", " Stored in directory: /root/.cache/pip/wheels/03/f1/56/f80d72741e400345b5a5b50ec3d929aca581bf45e0225d5c50\n", "Successfully built umap-learn pynndescent\n", "Installing collected packages: pynndescent, umap-learn\n", "Successfully installed pynndescent-0.5.6 umap-learn-0.5.3\n" ] } ] }, { "cell_type": "code", "source": [ "import numpy as np\n", "import pandas as pd\n", "import seaborn as sns\n", "\n", "from sklearn.datasets import load_iris\n", "from sklearn.datasets import load_digits\n", "from sklearn.datasets import fetch_california_housing\n", "from sklearn.datasets import make_classification\n", "\n", "from sklearn.preprocessing import StandardScaler\n", "from sklearn.pipeline import make_pipeline, Pipeline\n", "\n", "from sklearn.feature_selection import VarianceThreshold\n", "from sklearn.feature_selection import SelectKBest\n", "from sklearn.feature_selection import chi2\n", "from sklearn.feature_selection import f_classif\n", "from sklearn.feature_selection import mutual_info_regression\n", "from sklearn.feature_selection import SequentialFeatureSelector\n", "from sklearn.feature_selection import SelectFromModel\n", "from sklearn.inspection import permutation_importance\n", "\n", "from sklearn.model_selection import cross_validate\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.model_selection import RepeatedKFold\n", "from sklearn.model_selection import GridSearchCV\n", "\n", "from sklearn.neighbors import KNeighborsClassifier\n", "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.ensemble import RandomForestRegressor\n", "from sklearn.linear_model import RidgeCV\n", "from sklearn.linear_model import Lasso\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.svm import LinearSVC\n", "from 
sklearn.svm import SVC\n", "\n", "from boruta import BorutaPy\n", "\n", "from sklearn.decomposition import PCA\n", "from openTSNE import TSNE as oTSNE\n", "import umap\n", "\n", "from sklearn.cluster import KMeans\n", "from sklearn.cluster import FeatureAgglomeration\n", "\n", "import matplotlib as mpl\n", "from matplotlib import pyplot as plt\n", "%matplotlib inline" ], "metadata": { "id": "5bV_HvPiH-9i" }, "execution_count": 181, "outputs": [] }, { "cell_type": "code", "source": [ "# Upload the API’s key JSON file to your Colab\n", "# session by running the following code in a notebook cell:\n", "from google.colab import files\n", "files.upload()" ], "metadata": { "colab": { "resources": { "http://localhost:8080/nbextensions/google.colab/files.js": { "data": "Ly8gQ29weXJpZ2h0IDIwMTcgR29vZ2xlIExMQwovLwovLyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKLy8geW91IG1heSBub3QgdXNlIHRoaXMgZmlsZSBleGNlcHQgaW4gY29tcGxpYW5jZSB3aXRoIHRoZSBMaWNlbnNlLgovLyBZb3UgbWF5IG9idGFpbiBhIGNvcHkgb2YgdGhlIExpY2Vuc2UgYXQKLy8KLy8gICAgICBodHRwOi8vd3d3LmFwYWNoZS5vcmcvbGljZW5zZXMvTElDRU5TRS0yLjAKLy8KLy8gVW5sZXNzIHJlcXVpcmVkIGJ5IGFwcGxpY2FibGUgbGF3IG9yIGFncmVlZCB0byBpbiB3cml0aW5nLCBzb2Z0d2FyZQovLyBkaXN0cmlidXRlZCB1bmRlciB0aGUgTGljZW5zZSBpcyBkaXN0cmlidXRlZCBvbiBhbiAiQVMgSVMiIEJBU0lTLAovLyBXSVRIT1VUIFdBUlJBTlRJRVMgT1IgQ09ORElUSU9OUyBPRiBBTlkgS0lORCwgZWl0aGVyIGV4cHJlc3Mgb3IgaW1wbGllZC4KLy8gU2VlIHRoZSBMaWNlbnNlIGZvciB0aGUgc3BlY2lmaWMgbGFuZ3VhZ2UgZ292ZXJuaW5nIHBlcm1pc3Npb25zIGFuZAovLyBsaW1pdGF0aW9ucyB1bmRlciB0aGUgTGljZW5zZS4KCi8qKgogKiBAZmlsZW92ZXJ2aWV3IEhlbHBlcnMgZm9yIGdvb2dsZS5jb2xhYiBQeXRob24gbW9kdWxlLgogKi8KKGZ1bmN0aW9uKHNjb3BlKSB7CmZ1bmN0aW9uIHNwYW4odGV4dCwgc3R5bGVBdHRyaWJ1dGVzID0ge30pIHsKICBjb25zdCBlbGVtZW50ID0gZG9jdW1lbnQuY3JlYXRlRWxlbWVudCgnc3BhbicpOwogIGVsZW1lbnQudGV4dENvbnRlbnQgPSB0ZXh0OwogIGZvciAoY29uc3Qga2V5IG9mIE9iamVjdC5rZXlzKHN0eWxlQXR0cmlidXRlcykpIHsKICAgIGVsZW1lbnQuc3R5bGVba2V5XSA9IHN0eWxlQXR0cmlidXRlc1trZXldOwogIH0KICByZXR1cm4gZWxlbWVudDsKfQoK
Ly8gTWF4IG51bWJlciBvZiBieXRlcyB3aGljaCB3aWxsIGJlIHVwbG9hZGVkIGF0IGEgdGltZS4KY29uc3QgTUFYX1BBWUxPQURfU0laRSA9IDEwMCAqIDEwMjQ7CgpmdW5jdGlvbiBfdXBsb2FkRmlsZXMoaW5wdXRJZCwgb3V0cHV0SWQpIHsKICBjb25zdCBzdGVwcyA9IHVwbG9hZEZpbGVzU3RlcChpbnB1dElkLCBvdXRwdXRJZCk7CiAgY29uc3Qgb3V0cHV0RWxlbWVudCA9IGRvY3VtZW50LmdldEVsZW1lbnRCeUlkKG91dHB1dElkKTsKICAvLyBDYWNoZSBzdGVwcyBvbiB0aGUgb3V0cHV0RWxlbWVudCB0byBtYWtlIGl0IGF2YWlsYWJsZSBmb3IgdGhlIG5leHQgY2FsbAogIC8vIHRvIHVwbG9hZEZpbGVzQ29udGludWUgZnJvbSBQeXRob24uCiAgb3V0cHV0RWxlbWVudC5zdGVwcyA9IHN0ZXBzOwoKICByZXR1cm4gX3VwbG9hZEZpbGVzQ29udGludWUob3V0cHV0SWQpOwp9CgovLyBUaGlzIGlzIHJvdWdobHkgYW4gYXN5bmMgZ2VuZXJhdG9yIChub3Qgc3VwcG9ydGVkIGluIHRoZSBicm93c2VyIHlldCksCi8vIHdoZXJlIHRoZXJlIGFyZSBtdWx0aXBsZSBhc3luY2hyb25vdXMgc3RlcHMgYW5kIHRoZSBQeXRob24gc2lkZSBpcyBnb2luZwovLyB0byBwb2xsIGZvciBjb21wbGV0aW9uIG9mIGVhY2ggc3RlcC4KLy8gVGhpcyB1c2VzIGEgUHJvbWlzZSB0byBibG9jayB0aGUgcHl0aG9uIHNpZGUgb24gY29tcGxldGlvbiBvZiBlYWNoIHN0ZXAsCi8vIHRoZW4gcGFzc2VzIHRoZSByZXN1bHQgb2YgdGhlIHByZXZpb3VzIHN0ZXAgYXMgdGhlIGlucHV0IHRvIHRoZSBuZXh0IHN0ZXAuCmZ1bmN0aW9uIF91cGxvYWRGaWxlc0NvbnRpbnVlKG91dHB1dElkKSB7CiAgY29uc3Qgb3V0cHV0RWxlbWVudCA9IGRvY3VtZW50LmdldEVsZW1lbnRCeUlkKG91dHB1dElkKTsKICBjb25zdCBzdGVwcyA9IG91dHB1dEVsZW1lbnQuc3RlcHM7CgogIGNvbnN0IG5leHQgPSBzdGVwcy5uZXh0KG91dHB1dEVsZW1lbnQubGFzdFByb21pc2VWYWx1ZSk7CiAgcmV0dXJuIFByb21pc2UucmVzb2x2ZShuZXh0LnZhbHVlLnByb21pc2UpLnRoZW4oKHZhbHVlKSA9PiB7CiAgICAvLyBDYWNoZSB0aGUgbGFzdCBwcm9taXNlIHZhbHVlIHRvIG1ha2UgaXQgYXZhaWxhYmxlIHRvIHRoZSBuZXh0CiAgICAvLyBzdGVwIG9mIHRoZSBnZW5lcmF0b3IuCiAgICBvdXRwdXRFbGVtZW50Lmxhc3RQcm9taXNlVmFsdWUgPSB2YWx1ZTsKICAgIHJldHVybiBuZXh0LnZhbHVlLnJlc3BvbnNlOwogIH0pOwp9CgovKioKICogR2VuZXJhdG9yIGZ1bmN0aW9uIHdoaWNoIGlzIGNhbGxlZCBiZXR3ZWVuIGVhY2ggYXN5bmMgc3RlcCBvZiB0aGUgdXBsb2FkCiAqIHByb2Nlc3MuCiAqIEBwYXJhbSB7c3RyaW5nfSBpbnB1dElkIEVsZW1lbnQgSUQgb2YgdGhlIGlucHV0IGZpbGUgcGlja2VyIGVsZW1lbnQuCiAqIEBwYXJhbSB7c3RyaW5nfSBvdXRwdXRJZCBFbGVtZW50IElEIG9mIHRoZSBvdXRwdXQgZGlzcGxheS4KICogQHJldHVybiB7IUl0ZXJhYmxlPCFPYmplY3Q+fSBJdGVyYWJsZSBv
ZiBuZXh0IHN0ZXBzLgogKi8KZnVuY3Rpb24qIHVwbG9hZEZpbGVzU3RlcChpbnB1dElkLCBvdXRwdXRJZCkgewogIGNvbnN0IGlucHV0RWxlbWVudCA9IGRvY3VtZW50LmdldEVsZW1lbnRCeUlkKGlucHV0SWQpOwogIGlucHV0RWxlbWVudC5kaXNhYmxlZCA9IGZhbHNlOwoKICBjb25zdCBvdXRwdXRFbGVtZW50ID0gZG9jdW1lbnQuZ2V0RWxlbWVudEJ5SWQob3V0cHV0SWQpOwogIG91dHB1dEVsZW1lbnQuaW5uZXJIVE1MID0gJyc7CgogIGNvbnN0IHBpY2tlZFByb21pc2UgPSBuZXcgUHJvbWlzZSgocmVzb2x2ZSkgPT4gewogICAgaW5wdXRFbGVtZW50LmFkZEV2ZW50TGlzdGVuZXIoJ2NoYW5nZScsIChlKSA9PiB7CiAgICAgIHJlc29sdmUoZS50YXJnZXQuZmlsZXMpOwogICAgfSk7CiAgfSk7CgogIGNvbnN0IGNhbmNlbCA9IGRvY3VtZW50LmNyZWF0ZUVsZW1lbnQoJ2J1dHRvbicpOwogIGlucHV0RWxlbWVudC5wYXJlbnRFbGVtZW50LmFwcGVuZENoaWxkKGNhbmNlbCk7CiAgY2FuY2VsLnRleHRDb250ZW50ID0gJ0NhbmNlbCB1cGxvYWQnOwogIGNvbnN0IGNhbmNlbFByb21pc2UgPSBuZXcgUHJvbWlzZSgocmVzb2x2ZSkgPT4gewogICAgY2FuY2VsLm9uY2xpY2sgPSAoKSA9PiB7CiAgICAgIHJlc29sdmUobnVsbCk7CiAgICB9OwogIH0pOwoKICAvLyBXYWl0IGZvciB0aGUgdXNlciB0byBwaWNrIHRoZSBmaWxlcy4KICBjb25zdCBmaWxlcyA9IHlpZWxkIHsKICAgIHByb21pc2U6IFByb21pc2UucmFjZShbcGlja2VkUHJvbWlzZSwgY2FuY2VsUHJvbWlzZV0pLAogICAgcmVzcG9uc2U6IHsKICAgICAgYWN0aW9uOiAnc3RhcnRpbmcnLAogICAgfQogIH07CgogIGNhbmNlbC5yZW1vdmUoKTsKCiAgLy8gRGlzYWJsZSB0aGUgaW5wdXQgZWxlbWVudCBzaW5jZSBmdXJ0aGVyIHBpY2tzIGFyZSBub3QgYWxsb3dlZC4KICBpbnB1dEVsZW1lbnQuZGlzYWJsZWQgPSB0cnVlOwoKICBpZiAoIWZpbGVzKSB7CiAgICByZXR1cm4gewogICAgICByZXNwb25zZTogewogICAgICAgIGFjdGlvbjogJ2NvbXBsZXRlJywKICAgICAgfQogICAgfTsKICB9CgogIGZvciAoY29uc3QgZmlsZSBvZiBmaWxlcykgewogICAgY29uc3QgbGkgPSBkb2N1bWVudC5jcmVhdGVFbGVtZW50KCdsaScpOwogICAgbGkuYXBwZW5kKHNwYW4oZmlsZS5uYW1lLCB7Zm9udFdlaWdodDogJ2JvbGQnfSkpOwogICAgbGkuYXBwZW5kKHNwYW4oCiAgICAgICAgYCgke2ZpbGUudHlwZSB8fCAnbi9hJ30pIC0gJHtmaWxlLnNpemV9IGJ5dGVzLCBgICsKICAgICAgICBgbGFzdCBtb2RpZmllZDogJHsKICAgICAgICAgICAgZmlsZS5sYXN0TW9kaWZpZWREYXRlID8gZmlsZS5sYXN0TW9kaWZpZWREYXRlLnRvTG9jYWxlRGF0ZVN0cmluZygpIDoKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgJ24vYSd9IC0gYCkpOwogICAgY29uc3QgcGVyY2VudCA9IHNwYW4oJzAlIGRvbmUnKTsKICAgIGxpLmFwcGVuZENoaWxkKHBlcmNlbnQpOwoKICAgIG91dHB1dEVsZW1lbnQu
YXBwZW5kQ2hpbGQobGkpOwoKICAgIGNvbnN0IGZpbGVEYXRhUHJvbWlzZSA9IG5ldyBQcm9taXNlKChyZXNvbHZlKSA9PiB7CiAgICAgIGNvbnN0IHJlYWRlciA9IG5ldyBGaWxlUmVhZGVyKCk7CiAgICAgIHJlYWRlci5vbmxvYWQgPSAoZSkgPT4gewogICAgICAgIHJlc29sdmUoZS50YXJnZXQucmVzdWx0KTsKICAgICAgfTsKICAgICAgcmVhZGVyLnJlYWRBc0FycmF5QnVmZmVyKGZpbGUpOwogICAgfSk7CiAgICAvLyBXYWl0IGZvciB0aGUgZGF0YSB0byBiZSByZWFkeS4KICAgIGxldCBmaWxlRGF0YSA9IHlpZWxkIHsKICAgICAgcHJvbWlzZTogZmlsZURhdGFQcm9taXNlLAogICAgICByZXNwb25zZTogewogICAgICAgIGFjdGlvbjogJ2NvbnRpbnVlJywKICAgICAgfQogICAgfTsKCiAgICAvLyBVc2UgYSBjaHVua2VkIHNlbmRpbmcgdG8gYXZvaWQgbWVzc2FnZSBzaXplIGxpbWl0cy4gU2VlIGIvNjIxMTU2NjAuCiAgICBsZXQgcG9zaXRpb24gPSAwOwogICAgZG8gewogICAgICBjb25zdCBsZW5ndGggPSBNYXRoLm1pbihmaWxlRGF0YS5ieXRlTGVuZ3RoIC0gcG9zaXRpb24sIE1BWF9QQVlMT0FEX1NJWkUpOwogICAgICBjb25zdCBjaHVuayA9IG5ldyBVaW50OEFycmF5KGZpbGVEYXRhLCBwb3NpdGlvbiwgbGVuZ3RoKTsKICAgICAgcG9zaXRpb24gKz0gbGVuZ3RoOwoKICAgICAgY29uc3QgYmFzZTY0ID0gYnRvYShTdHJpbmcuZnJvbUNoYXJDb2RlLmFwcGx5KG51bGwsIGNodW5rKSk7CiAgICAgIHlpZWxkIHsKICAgICAgICByZXNwb25zZTogewogICAgICAgICAgYWN0aW9uOiAnYXBwZW5kJywKICAgICAgICAgIGZpbGU6IGZpbGUubmFtZSwKICAgICAgICAgIGRhdGE6IGJhc2U2NCwKICAgICAgICB9LAogICAgICB9OwoKICAgICAgbGV0IHBlcmNlbnREb25lID0gZmlsZURhdGEuYnl0ZUxlbmd0aCA9PT0gMCA/CiAgICAgICAgICAxMDAgOgogICAgICAgICAgTWF0aC5yb3VuZCgocG9zaXRpb24gLyBmaWxlRGF0YS5ieXRlTGVuZ3RoKSAqIDEwMCk7CiAgICAgIHBlcmNlbnQudGV4dENvbnRlbnQgPSBgJHtwZXJjZW50RG9uZX0lIGRvbmVgOwoKICAgIH0gd2hpbGUgKHBvc2l0aW9uIDwgZmlsZURhdGEuYnl0ZUxlbmd0aCk7CiAgfQoKICAvLyBBbGwgZG9uZS4KICB5aWVsZCB7CiAgICByZXNwb25zZTogewogICAgICBhY3Rpb246ICdjb21wbGV0ZScsCiAgICB9CiAgfTsKfQoKc2NvcGUuZ29vZ2xlID0gc2NvcGUuZ29vZ2xlIHx8IHt9OwpzY29wZS5nb29nbGUuY29sYWIgPSBzY29wZS5nb29nbGUuY29sYWIgfHwge307CnNjb3BlLmdvb2dsZS5jb2xhYi5fZmlsZXMgPSB7CiAgX3VwbG9hZEZpbGVzLAogIF91cGxvYWRGaWxlc0NvbnRpbnVlLAp9Owp9KShzZWxmKTsK", "ok": true, "headers": [ [ "content-type", "application/javascript" ] ], "status": 200, "status_text": "" } }, "base_uri": "https://localhost:8080/", "height": 92 }, "id": "wGIhk6PolW1K", 
"outputId": "05e411b9-9352-4293-f223-58fb05ee57cd" }, "execution_count": 20, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "" ], "text/html": [ "\n", " \n", " \n", " Upload widget is only available when the cell has been executed in the\n", " current browser session. Please rerun this cell to enable.\n", " \n", " " ] }, "metadata": {} }, { "output_type": "stream", "name": "stdout", "text": [ "Saving kaggle.json to kaggle.json\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ "{'kaggle.json': b'{\"username\":\"phonchi\",\"key\":\"REDACTED\"}'}" ] }, "metadata": {}, "execution_count": 20 } ] }, { "cell_type": "code", "source": [ "# -p makes this cell safe to re-run when ~/.kaggle already exists\n", "!mkdir -p ~/.kaggle\n", "!cp kaggle.json ~/.kaggle/\n", "!chmod 600 ~/.kaggle/kaggle.json" ], "metadata": { "id": "pOIWaP0RlcFv" }, "execution_count": 21, "outputs": [] }, { "cell_type": "markdown", "source": [ "## Feature selection" ], "metadata": { "id": "VcdwxGJwV8vU" } }, { "cell_type": "markdown", "source": [ "The classes in the `sklearn.feature_selection` module can be used for feature selection/dimensionality reduction on sample sets, either to improve estimators’ accuracy scores or to boost their performance on very high-dimensional datasets." ], "metadata": { "id": "Uf513TCeWSX0" } }, { "cell_type": "markdown", "source": [ "### Removing low variance features" ], "metadata": { "id": "RCajAbcsWdxE" } }, { "cell_type": "markdown", "source": [ "Suppose that we have a dataset with boolean features, and we want to remove all features that are either one or zero (on or off) in more than 80% of the samples. 
Boolean features are Bernoulli random variables, and the variance of such a variable is $\\mathrm{Var}[X] = p(1 - p)$, so we can select using the threshold `0.8 * (1 - 0.8)`." ], "metadata": { "id": "pCPKnW7rWhfV" } }, { "cell_type": "code", "source": [ "X = [[0, 0, 1], [0, 1, 0], [1, 0, 0], [0, 1, 1], [0, 1, 0], [0, 1, 1]]" ], "metadata": { "id": "S10Rp8tJWCH2" }, "execution_count": 1, "outputs": [] }, { "cell_type": "code", "source": [ "sel = VarianceThreshold(threshold=(.8 * (1 - .8)))\n", "sel.fit_transform(X)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "XPhKdizqcvtk", "outputId": "ec4b5e4e-a06a-49a0-91bc-e444801c8d09" }, "execution_count": 3, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array([[0, 1],\n", " [1, 0],\n", " [0, 0],\n", " [1, 1],\n", " [1, 0],\n", " [1, 1]])" ] }, "metadata": {}, "execution_count": 3 } ] }, { "cell_type": "markdown", "source": [ "As expected, `VarianceThreshold` has removed the first column, which has a probability $p = 5/6 > 0.8$ of containing a zero." ], "metadata": { "id": "JLZi5CgFczbS" } }, { "cell_type": "markdown", "source": [ "### Univariate feature selection" ], "metadata": { "id": "ymMz_smRc8N6" } }, { "cell_type": "markdown", "source": [ "`Scikit-learn` exposes feature selection routines as objects that implement the transform method. 
For instance, we can perform a $\\chi^2$ test to the samples to retrieve only the two best features as follows:" ], "metadata": { "id": "OsocPngMc9hZ" } }, { "cell_type": "code", "source": [ "X, y = load_iris(return_X_y=True)\n", "X.shape" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "T73LeQPxcytF", "outputId": "8bc1762e-26b9-4bbc-92e1-b64965f442fe" }, "execution_count": 5, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "(150, 4)" ] }, "metadata": {}, "execution_count": 5 } ] }, { "cell_type": "code", "source": [ "X_new = SelectKBest(chi2, k=2).fit_transform(X, y)\n", "X_new.shape" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "U---Mg1DesiC", "outputId": "6f3086d8-4882-42ff-af90-80fed76e0574" }, "execution_count": 6, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "(150, 2)" ] }, "metadata": {}, "execution_count": 6 } ] }, { "cell_type": "markdown", "source": [ "These objects take as input a scoring function that returns univariate scores and p-values (or only scores for `SelectKBest` and `SelectPercentile`):\n", "\n", "* For regression: `f_regression`, `mutual_info_regression`\n", "* For classification: `chi2`, `f_classif`, `mutual_info_classif`" ], "metadata": { "id": "Rh4TnGG4e2yo" } }, { "cell_type": "markdown", "source": [ "The methods based on F-test estimate the degree of linear dependency between two random variables. For example, we create a dataset with two informative features among a hundred. To simplify our example, we do not include either redundant or repeated features." 
], "metadata": { "id": "QxVJKlWDe-9Z" } }, { "cell_type": "code", "source": [ "data, target = make_classification(\n", "    n_samples=5000,\n", "    n_features=100,\n", "    n_informative=2,\n", "    n_redundant=0,\n", "    n_repeated=0,\n", "    random_state=0,\n", ")" ], "metadata": { "id": "7i4IUKasewA6" }, "execution_count": 9, "outputs": [] }, { "cell_type": "markdown", "source": [ "We will create two machine learning pipelines. The former will be a random forest that will use all available features. The latter will also be a random forest, but we will add a feature selection step to train this classifier. " ], "metadata": { "id": "JiKLn-26fiJw" } }, { "cell_type": "code", "source": [ "# Let’s create the model without any feature selection\n", "# (seeded so the cross-validation scores are reproducible across runs)\n", "model_without_selection = RandomForestClassifier(n_jobs=2, random_state=0)" ], "metadata": { "id": "gbat_wsOfakf" }, "execution_count": 11, "outputs": [] }, { "cell_type": "code", "source": [ "# Then, let’s create a pipeline where the first stage will make the feature selection processing.\n", "# The forest uses the same seed as the baseline model so the comparison is like-for-like.\n", "model_with_selection = make_pipeline(\n", "    SelectKBest(score_func=f_classif, k=2),\n", "    RandomForestClassifier(n_jobs=2, random_state=0),\n", ")" ], "metadata": { "id": "fwxayZ8Sfyfx" }, "execution_count": 12, "outputs": [] }, { "cell_type": "markdown", "source": [ "We will measure the average time spent to train each pipeline and make it predict. Besides, we will compute the testing score of the model. We will collect these results via cross-validation." ], "metadata": { "id": "2vWsIOdMf8sS" } }, { "cell_type": "code", "source": [ "# Let’s start with the random forest without feature selection. 
We will store the results into a dataframe.\n", "cv_results_without_selection = cross_validate(model_without_selection, data, target)\n", "cv_results_without_selection = pd.DataFrame(cv_results_without_selection)" ], "metadata": { "id": "-bgBAgvPf6QA" }, "execution_count": 13, "outputs": [] }, { "cell_type": "code", "source": [ "# Now, we will repeat the process for the pipeline incorporating the feature selection.\n", "cv_results_with_selection = cross_validate(\n", " model_with_selection, data, target, return_estimator=True)\n", "cv_results_with_selection = pd.DataFrame(cv_results_with_selection)" ], "metadata": { "id": "No1nRqWigBE0" }, "execution_count": 14, "outputs": [] }, { "cell_type": "markdown", "source": [ "To analyze the results, we will merge the results from the two pipelines into a single pandas dataframe." ], "metadata": { "id": "hJUbhzd6gJtp" } }, { "cell_type": "code", "source": [ "cv_results = pd.concat(\n", " [cv_results_without_selection, cv_results_with_selection],\n", " axis=1,\n", " keys=[\"Without feature selection\", \"With feature selection\"],\n", ")\n", "# swap the level of the multi-index of the columns\n", "cv_results = cv_results.swaplevel(axis=\"columns\")" ], "metadata": { "id": "NwKMH8AlgIz0" }, "execution_count": 15, "outputs": [] }, { "cell_type": "markdown", "source": [ "Let’s first analyze the train and score time for each pipeline." ], "metadata": { "id": "APuAuoLQgNgh" } }, { "cell_type": "code", "source": [ "color = {\"whiskers\": \"black\", \"medians\": \"black\", \"caps\": \"black\"}\n", "cv_results[\"fit_time\"].plot.box(color=color, vert=False)\n", "plt.xlabel(\"Elapsed time (s)\")\n", "_ = plt.title(\"Time to fit the model\")" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 295 }, "id": "NyjNNnEtgM77", "outputId": "05459d87-12c9-4623-9de8-09998b41be53" }, "execution_count": 16, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": { "needs_background": "light" } } ] }, { "cell_type": "code", "source": [ "cv_results[\"score_time\"].plot.box(color=color, vert=False)\n", "plt.xlabel(\"Elapsed time (s)\")\n", "_ = plt.title(\"Time to make prediction\")" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 295 }, "id": "IHk6it43gSKM", "outputId": "10be241a-3559-41da-965d-ae378cb5f852" }, "execution_count": 17, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": { "needs_background": "light" } } ] }, { "cell_type": "markdown", "source": [ "We can draw the same conclusions for both training and scoring elapsed time: selecting the most informative features speed-up our pipeline. Of course, such speed-up is beneficial only if the generalization performance in terms of metrics remain the same. Let’s check the testing score." ], "metadata": { "id": "5tHmNZ85gmPa" } }, { "cell_type": "code", "source": [ "cv_results[\"test_score\"].plot.box(color=color, vert=False)\n", "plt.xlabel(\"Accuracy score\")\n", "_ = plt.title(\"Test score via cross-validation\")" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 295 }, "id": "4onQdLXGgfJn", "outputId": "3ff34056-5a48-4bb1-abdb-ab2b5ada7f8c" }, "execution_count": 18, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": { "needs_background": "light" } } ] }, { "cell_type": "markdown", "source": [ "We can observe that the model’s generalization performance selecting a subset of features decreases compared with the model using all available features. Since we generated the dataset, we can infer that the decrease is because of the selection. The feature selection algorithm did not choose the two informative features." ], "metadata": { "id": "n_Ym3R87g4vw" } }, { "cell_type": "code", "source": [ "for idx, pipeline in enumerate(cv_results_with_selection[\"estimator\"]):\n", " print(\n", " f\"Fold #{idx} - features selected are: \"\n", " f\"{np.argsort(pipeline[0].scores_)[-2:]}\"\n", " )" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "mTUZGkUPgowf", "outputId": "87061117-1f82-4759-ac85-ef5ec8385e56" }, "execution_count": 19, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Fold #0 - features selected are: [89 53]\n", "Fold #1 - features selected are: [29 53]\n", "Fold #2 - features selected are: [52 53]\n", "Fold #3 - features selected are: [49 53]\n", "Fold #4 - features selected are: [49 53]\n" ] } ] }, { "cell_type": "markdown", "source": [ "We see that the feature 53 is always selected while the other feature varies depending on the cross-validation fold.\n", "\n", "If we would like to keep our score with similar generalization performance, **we could choose another metric to perform the test or select more features.** For instance, we could select the number of features based on a specific percentile of the highest scores." ], "metadata": { "id": "RgTtgRS5hBBJ" } }, { "cell_type": "markdown", "source": [ "#### Mutual information" ], "metadata": { "id": "HiK_yc8kk4GI" } }, { "cell_type": "markdown", "source": [ "The [*Automobile*](https://www.kaggle.com/toramky/automobile-dataset) dataset consists of 193 cars from the 1985 model year. 
The goal for this dataset is to predict a car's `price` (the target) from 23 of the car's features, such as `make`, `body_style`, and `horsepower`. In this example, we'll rank the features with mutual information and investigate the results by data visualization. (The original dataset requires data cleaning, you could refer to https://skill-lync.com/student-projects/project-1-1299)" ], "metadata": { "id": "k-V69-wYlCxA" } }, { "cell_type": "code", "source": [ "df = pd.read_csv(\"autos.csv\")\n", "df.head()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 352 }, "id": "lg40S0_0llto", "outputId": "d7663ccb-de86-4e56-b1d5-d4c0e5287c48" }, "execution_count": 33, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " symboling make fuel_type aspiration num_of_doors body_style \\\n", "0 3 alfa-romero gas std 2 convertible \n", "1 3 alfa-romero gas std 2 convertible \n", "2 1 alfa-romero gas std 2 hatchback \n", "3 2 audi gas std 4 sedan \n", "4 2 audi gas std 4 sedan \n", "\n", " drive_wheels engine_location wheel_base length ... engine_size \\\n", "0 rwd front 88.6 168.8 ... 130 \n", "1 rwd front 88.6 168.8 ... 130 \n", "2 rwd front 94.5 171.2 ... 152 \n", "3 fwd front 99.8 176.6 ... 109 \n", "4 4wd front 99.4 176.6 ... 136 \n", "\n", " fuel_system bore stroke compression_ratio horsepower peak_rpm city_mpg \\\n", "0 mpfi 3.47 2.68 9 111 5000 21 \n", "1 mpfi 3.47 2.68 9 111 5000 21 \n", "2 mpfi 2.68 3.47 9 154 5000 19 \n", "3 mpfi 3.19 3.40 10 102 5500 24 \n", "4 mpfi 3.19 3.40 8 115 5500 18 \n", "\n", " highway_mpg price \n", "0 27 13495 \n", "1 27 16500 \n", "2 26 16500 \n", "3 30 13950 \n", "4 22 17450 \n", "\n", "[5 rows x 25 columns]" ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
symbolingmakefuel_typeaspirationnum_of_doorsbody_styledrive_wheelsengine_locationwheel_baselength...engine_sizefuel_systemborestrokecompression_ratiohorsepowerpeak_rpmcity_mpghighway_mpgprice
03alfa-romerogasstd2convertiblerwdfront88.6168.8...130mpfi3.472.6891115000212713495
13alfa-romerogasstd2convertiblerwdfront88.6168.8...130mpfi3.472.6891115000212716500
21alfa-romerogasstd2hatchbackrwdfront94.5171.2...152mpfi2.683.4791545000192616500
32audigasstd4sedanfwdfront99.8176.6...109mpfi3.193.40101025500243013950
42audigasstd4sedan4wdfront99.4176.6...136mpfi3.193.4081155500182217450
\n", "

5 rows × 25 columns

\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 33 } ] }, { "cell_type": "markdown", "source": [ "The scikit-learn algorithm for MI treats discrete features differently from continuous features. Consequently, you need to tell it which are which. As a rule of thumb, anything that *must* have a `float` dtype is *not* discrete. Categoricals (`object` or `category` dtype) can be treated as discrete by giving them a label encoding." ], "metadata": { "id": "YX3VJQKJl4hf" } }, { "cell_type": "code", "source": [ "X = df.copy()\n", "y = X.pop(\"price\")\n", "\n", "# Label encoding for categoricals\n", "for colname in X.select_dtypes(\"object\"):\n", "    X[colname], _ = X[colname].factorize()\n", "\n", "# All discrete features should now have integer dtypes (double-check this before using MI!)\n", "# `is_integer_dtype` is used rather than `== int` because `int` resolves to a\n", "# platform-dependent width (e.g. int32 on some Windows/NumPy builds), which can\n", "# silently flag int64 columns as continuous.\n", "discrete_features = X.dtypes.map(pd.api.types.is_integer_dtype).astype(bool)" ], "metadata": { "id": "VOVazcEFlv6Q" }, "execution_count": 34, "outputs": [] }, { "cell_type": "markdown", "source": [ "Scikit-learn has two mutual information metrics in its `feature_selection` module: one for real-valued targets (`mutual_info_regression`) and one for categorical targets (`mutual_info_classif`). Our target, `price`, is real-valued. The next cell computes the MI scores for our features and wraps them up in a nice dataframe." 
], "metadata": { "id": "SgQfNAYNmGhw" } }, { "cell_type": "code", "source": [ "def make_mi_scores(X, y, discrete_features, random_state=0):\n", "    \"\"\"Rank the columns of `X` by their mutual information with the target `y`.\n", "\n", "    `discrete_features` is forwarded unchanged to `mutual_info_regression`.\n", "    `random_state` seeds the estimator so the scores are reproducible across\n", "    runs; pass `None` to get unseeded behaviour.\n", "\n", "    Returns a pandas Series named \"MI Scores\", indexed by column name and\n", "    sorted from the highest to the lowest score.\n", "    \"\"\"\n", "    mi_scores = mutual_info_regression(\n", "        X, y, discrete_features=discrete_features, random_state=random_state\n", "    )\n", "    mi_scores = pd.Series(mi_scores, name=\"MI Scores\", index=X.columns)\n", "    return mi_scores.sort_values(ascending=False)\n", "\n", "mi_scores = make_mi_scores(X, y, discrete_features)\n", "mi_scores[::3]  # show a few features with their MI scores" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "36ZK4lJNmDZi", "outputId": "b67fcb48-2df8-4f98-f0c9-397a177f06f5" }, "execution_count": 35, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "curb_weight 1.552832\n", "highway_mpg 0.959290\n", "length 0.615354\n", "bore 0.504682\n", "stroke 0.391373\n", "num_of_cylinders 0.330589\n", "compression_ratio 0.134892\n", "fuel_type 0.047279\n", "Name: MI Scores, dtype: float64" ] }, "metadata": {}, "execution_count": 35 } ] }, { "cell_type": "code", "source": [ "def plot_mi_scores(scores):\n", "    \"\"\"Draw a horizontal bar chart of `scores` on the current matplotlib figure.\n", "\n", "    `scores` is a pandas Series of MI scores indexed by feature name; bars are\n", "    ordered so that the highest score ends up at the top of the chart.\n", "    \"\"\"\n", "    scores = scores.sort_values(ascending=True)\n", "    positions = np.arange(len(scores))  # y-coordinate of each bar\n", "    labels = list(scores.index)\n", "    plt.barh(positions, scores)\n", "    plt.yticks(positions, labels)\n", "    plt.xlabel(\"MI score\")\n", "    plt.title(\"Mutual Information Scores\")\n", "\n", "\n", "plt.figure(dpi=100, figsize=(8, 5))\n", "plot_mi_scores(mi_scores)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 460 }, "id": "Vi9tYOlEmKvL", "outputId": "c741f89c-0c60-4f4d-fa84-d02500098898" }, "execution_count": 36, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": { "needs_background": "light" } } ] }, { "cell_type": "code", "source": [ "# As we might expect, the high-scoring `curb_weight` feature exhibits a strong relationship with `price`, the target.\n", "sns.relplot(x=\"curb_weight\", y=\"price\", data=df)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 387 }, "id": "zDs3GY-Fo8zW", "outputId": "e4157100-6d61-4295-efbe-512029b07e61" }, "execution_count": 37, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "" ] }, "metadata": {}, "execution_count": 37 }, { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": { "needs_background": "light" } } ] }, { "cell_type": "markdown", "source": [ "The `fuel_type` feature has a fairly low MI score, but as we can see from the figure, it clearly separates two `price` populations with different trends within the `horsepower` feature. **This indicates that `fuel_type` contributes an interaction effect and might not be unimportant after all.** Before deciding a feature is unimportant from its MI score, it's good to investigate any possible interaction effects -- domain knowledge can offer a lot of guidance here." ], "metadata": { "id": "A34J5fAZpNJO" } }, { "cell_type": "code", "source": [ "sns.lmplot(x=\"horsepower\", y=\"price\", hue=\"fuel_type\", data=df);" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 369 }, "id": "TCwOeNdepGUP", "outputId": "7255762f-7168-44a9-cd38-6c578861b4c1" }, "execution_count": 38, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": { "needs_background": "light" } } ] }, { "cell_type": "markdown", "source": [ "### Sequential feature selection" ], "metadata": { "id": "o0rWGFi9pvq4" } }, { "cell_type": "markdown", "source": [ "Sequential Feature Selection is available in the `SequentialFeatureSelector` transformer. SFS can be either forward or backward:\n", "\n", "* Forward-SFS is a greedy procedure that iteratively finds the best new feature to add to the set of selected features. Concretely, we initially start with zero feature and find the one feature that maximizes a cross-validated score when an estimator is trained on this single feature. Once that first feature is selected, we repeat the procedure by adding a new feature to the set of selected features. The procedure stops when the desired number of selected features is reached, as determined by the n_features_to_select parameter.\n", "\n", "* Backward-SFS follows the same idea but works in the opposite direction: instead of starting with no feature and greedily adding features, we start with all the features and greedily remove features from the set. The direction parameter controls whether forward or backward SFS is used.\n", "\n", "In general, forward and backward selection do not yield equivalent results. Also, one may be much faster than the other depending on the requested number of selected features: if we have 10 features and ask for 7 selected features, forward selection would need to perform 7 iterations while backward selection would only need to perform 3." 
], "metadata": { "id": "wAQBoFEVp1X1" } }, { "cell_type": "code", "source": [ "X, y = load_iris(return_X_y=True)\n", "knn = KNeighborsClassifier(n_neighbors=3)" ], "metadata": { "id": "XN-VAbTRpV_I" }, "execution_count": 41, "outputs": [] }, { "cell_type": "code", "source": [ "sfs = SequentialFeatureSelector(knn, n_features_to_select=3)\n", "sfs.fit(X, y)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "FWjzoHnk-Orv", "outputId": "2d761371-31d3-4f4b-ed23-a9b2c9e865e7" }, "execution_count": 42, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "SequentialFeatureSelector(estimator=KNeighborsClassifier(n_neighbors=3),\n", " n_features_to_select=3)" ] }, "metadata": {}, "execution_count": 42 } ] }, { "cell_type": "code", "source": [ "sfs.transform(X).shape" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "8pYRsVKX-nk8", "outputId": "06c63564-28e1-4912-9a03-e99e64660ed7" }, "execution_count": 43, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "(150, 3)" ] }, "metadata": {}, "execution_count": 43 } ] }, { "cell_type": "markdown", "source": [ "### Feature selection from model" ], "metadata": { "id": "Wkh5pr70-sCN" } }, { "cell_type": "markdown", "source": [ "`SelectFromModel` is a meta-transformer that can be used alongside any estimator that assigns importance to each feature through a specific attribute (such as `coef_`, `feature_importances_`) or via an `importance_getter` callable after fitting. The features are considered unimportant and removed if the corresponding importance of the feature values are below the provided threshold parameter. \n", "\n", "Apart from specifying the threshold numerically, there are built-in heuristics for finding a threshold using a string argument. Available heuristics are “mean”, “median” and float multiples of these like “0.1*mean”. 
In combination with the threshold criteria, one can use the `max_features` parameter to set a limit on the number of features to select." ], "metadata": { "id": "cAf7ARk0Howp" } }, { "cell_type": "code", "source": [ "X, y = load_iris(return_X_y=True)" ], "metadata": { "id": "gkWQgTpd-pwt" }, "execution_count": 45, "outputs": [] }, { "cell_type": "code", "source": [ "lsvc = LinearSVC(C=0.01, penalty=\"l1\", dual=False).fit(X, y)" ], "metadata": { "id": "NvnykC0WJC4_" }, "execution_count": 48, "outputs": [] }, { "cell_type": "code", "source": [ "model = SelectFromModel(lsvc, prefit=True)" ], "metadata": { "id": "jsBkMzF4JFAb" }, "execution_count": 49, "outputs": [] }, { "cell_type": "code", "source": [ "X_new = model.transform(X)\n", "X_new.shape" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "WbuJtKY2JFRg", "outputId": "d9739206-61be-4a02-dec1-4e5dc3957a57" }, "execution_count": 50, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "(150, 3)" ] }, "metadata": {}, "execution_count": 50 } ] }, { "cell_type": "markdown", "source": [ "### A concrete example" ], "metadata": { "id": "3zP7dVWRLgmn" } }, { "cell_type": "markdown", "source": [ "The following dataset records neighborhoods (blocks) in California. The task is to predict the median house value (target) from information about each neighborhood, such as the average number of rooms, the latitude, the longitude, or the median income of the people living there."
], "metadata": { "id": "k8srd8jgJZ94" } }, { "cell_type": "code", "source": [ "X, y = fetch_california_housing(as_frame=True, return_X_y=True)" ], "metadata": { "id": "qT8aM1mqJN3J" }, "execution_count": 52, "outputs": [] }, { "cell_type": "code", "source": [ "# To speed up the computation, we take the first 10000 samples\n", "X = X[:10000]\n", "y = y[:10000]\n", "X.head()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "nUIwkfs4JlWO", "outputId": "ad99d5a2-70ed-47c2-d5be-11aa243806b2" }, "execution_count": 53, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " MedInc HouseAge AveRooms AveBedrms Population AveOccup Latitude \\\n", "0 8.3252 41.0 6.984127 1.023810 322.0 2.555556 37.88 \n", "1 8.3014 21.0 6.238137 0.971880 2401.0 2.109842 37.86 \n", "2 7.2574 52.0 8.288136 1.073446 496.0 2.802260 37.85 \n", "3 5.6431 52.0 5.817352 1.073059 558.0 2.547945 37.85 \n", "4 3.8462 52.0 6.281853 1.081081 565.0 2.181467 37.85 \n", "\n", " Longitude \n", "0 -122.23 \n", "1 -122.22 \n", "2 -122.24 \n", "3 -122.25 \n", "4 -122.25 " ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
MedIncHouseAgeAveRoomsAveBedrmsPopulationAveOccupLatitudeLongitude
08.325241.06.9841271.023810322.02.55555637.88-122.23
18.301421.06.2381370.9718802401.02.10984237.86-122.22
27.257452.08.2881361.073446496.02.80226037.85-122.24
35.643152.05.8173521.073059558.02.54794537.85-122.25
43.846252.06.2818531.081081565.02.18146737.85-122.25
\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 53 } ] }, { "cell_type": "markdown", "source": [ "The features read as follows:\n", "\n", "* MedInc: median income in block\n", "* HouseAge: median house age in block\n", "* AveRooms: average number of rooms\n", "* AveBedrms: average number of bedrooms\n", "* Population: block population\n", "* AveOccup: average house occupancy\n", "* Latitude: house block latitude\n", "* Longitude: house block longitude\n", "* MedHouseVal: Median house value in 100k$ (target)\n", "\n", "To assess the quality of our inspection technique, let’s add some random features that won’t help the prediction (uninformative features)" ], "metadata": { "id": "8i7IQeucJsWY" } }, { "cell_type": "code", "source": [ "# Adding uninformative (random) features\n", "rng = np.random.RandomState(0)\n", "# randint's upper bound is exclusive: (0, 2) yields a real 0/1 variable,\n", "# whereas (0, 1) would produce a constant column of zeros.\n", "bin_var = pd.Series(rng.randint(0, 2, X.shape[0]), index=X.index, name='rnd_bin')\n", "# Shuffle the integers 0..n-1 so this high-cardinality feature does not encode the row order.\n", "num_var = pd.Series(rng.permutation(X.shape[0]), index=X.index, name='rnd_num')\n", "# Both series share X's index, so concat aligns them row by row.\n", "X_with_rnd_feat = pd.concat((X, bin_var, num_var), axis=1)" ], "metadata": { "id": "SLi9SoU6JqWe" }, "execution_count": 54, "outputs": [] }, { "cell_type": "code", "source": [ "X_train, X_test, y_train, y_test = train_test_split(X_with_rnd_feat, y, random_state=42)" ], "metadata": { "id": "OjVGGwhmJz91" }, "execution_count": 57, "outputs": [] }, { "cell_type": "markdown", "source": [ "In linear models, the target value is modeled as a linear combination of the features."
], "metadata": { "id": "m8Byb20-KF5I" } }, { "cell_type": "code", "source": [ "model = RidgeCV()\n", "\n", "model.fit(X_train, y_train)\n", "\n", "print(f'model score on training data: {model.score(X_train, y_train)}')\n", "print(f'model score on testing data: {model.score(X_test, y_test)}')" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "CVQ3s5uuJ83F", "outputId": "829c716a-43ca-4701-d474-d7bc862f15c8" }, "execution_count": 60, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "model score on training data: 0.6048814128047645\n", "model score on testing data: 0.5866391379089506\n" ] } ] }, { "cell_type": "markdown", "source": [ "Our linear model obtains an $R^2$ score of .60, so it explains a significant part of the target. Its coefficients should therefore be somewhat meaningful. Let’s look at the learnt coefficients." ], "metadata": { "id": "41HBpjFYKe4a" } }, { "cell_type": "code", "source": [ "coefs = pd.DataFrame(\n", " model.coef_,\n", " columns=['Coefficients'], index=X_train.columns\n", ")\n", "\n", "coefs.plot(kind='barh', figsize=(9, 7))\n", "plt.title('Ridge model')\n", "plt.axvline(x=0, color='.5')\n", "plt.subplots_adjust(left=.3)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 444 }, "id": "tdWavBbTKZxa", "outputId": "5f300833-73c9-463b-9666-95a1863b9925" }, "execution_count": 61, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": { "needs_background": "light" } } ] }, { "cell_type": "markdown", "source": [ "The `AveBedrms` feature has the largest coefficient. However, we can’t compare the magnitudes of these coefficients directly, since the features are not scaled. Indeed, `Population` is an integer which can be in the thousands, while `AveBedrms` is around 1 and `Latitude` is in degrees.\n", "\n", "So the `Population` coefficient is expressed in `“100k$/inhabitant”` while the `AveBedrms` coefficient is expressed in `“100k$/nb of bedrooms”` and the `Latitude` coefficient in `“100k$/degree”`. We see that changing the population by one barely changes the outcome, while as we go north (latitude increases) the price decreases. Also, adding a bedroom (keeping all other features constant) should raise the price of the house by about `80k$`.\n", "\n", "So looking at the coefficient plot to gauge feature importance can be misleading, as some features vary on a small scale while others vary over several orders of magnitude. So before any interpretation, we need to scale each column (removing the mean and scaling the variance to 1)."
], "metadata": { "id": "BEwY2u2PKrvR" } }, { "cell_type": "code", "source": [ "model = make_pipeline(StandardScaler(), RidgeCV())\n", "\n", "model.fit(X_train, y_train)\n", "\n", "print(f'model score on training data: {model.score(X_train, y_train)}')\n", "print(f'model score on testing data: {model.score(X_test, y_test)}')" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "hk5ebi7xKlJ0", "outputId": "e3182ee2-6c9c-4ddf-b935-a545c423df65" }, "execution_count": 65, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "model score on training data: 0.6048511948222112\n", "model score on testing data: 0.5863381274564599\n" ] } ] }, { "cell_type": "code", "source": [ "coefs = pd.DataFrame(\n", " model[1].coef_,\n", " columns=['Coefficients'], index=X_train.columns\n", ")\n", "\n", "coefs.plot(kind='barh', figsize=(9, 7))\n", "plt.title('Ridge model')\n", "plt.axvline(x=0, color='.5')\n", "plt.subplots_adjust(left=.3)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 444 }, "id": "8KjhPtSzLP3P", "outputId": "9288292a-4415-4585-c361-4c75afa51828" }, "execution_count": 66, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": { "needs_background": "light" } } ] }, { "cell_type": "markdown", "source": [ "Now that the coefficients have been scaled, we can safely compare them. Median income, longitude and latitude are the three variables that most influence the model.\n", "\n", "The plot above tells us about dependencies between a specific feature and the target when all other features remain constant, i.e., conditional dependencies. An increase of the `HouseAge` will induce an increase of the price when all other features remain constant. On the contrary, an increase of the average number of rooms will induce a decrease of the price when all other features remain constant." ], "metadata": { "id": "NehlVHi2LWHY" } }, { "cell_type": "markdown", "source": [ "We can check the coefficient variability through cross-validation: it is a form of data perturbation." ], "metadata": { "id": "ux2-205wLmSh" } }, { "cell_type": "code", "source": [ "# Fix the CV seed so the coefficient spread is reproducible across runs\n", "cv_model = cross_validate(\n", "    model, X_with_rnd_feat, y,\n", "    cv=RepeatedKFold(n_splits=5, n_repeats=5, random_state=0),\n", "    return_estimator=True, n_jobs=2\n", ")\n", "# `est` (not `model`) so the loop variable does not shadow the pipeline fitted above\n", "coefs = pd.DataFrame(\n", "    [est[1].coef_ for est in cv_model['estimator']],\n", "    columns=X_with_rnd_feat.columns\n", ")\n", "plt.figure(figsize=(9, 7))\n", "sns.boxplot(data=coefs, orient='h', color='cyan', saturation=0.5)\n", "plt.axvline(x=0, color='.5')\n", "plt.xlabel('Coefficient importance')\n", "plt.title('Coefficient importance and its variability')\n", "plt.subplots_adjust(left=.3)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 458 }, "id": "deiQJiPNLR1_", "outputId": "e4238395-ff2a-4a94-a018-ba1fea30d4e4" }, "execution_count": 64, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": { "needs_background": "light" } } ] }, { "cell_type": "markdown", "source": [ "Now suppose we want to select the four features that are the most important according to the coefficients. `SelectFromModel` is meant just for that. `SelectFromModel` accepts a `threshold` parameter and will select the features whose importance (defined by the coefficients) is above this threshold." ], "metadata": { "id": "wL8szAkwMLto" } }, { "cell_type": "code", "source": [ "model" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "T8C_GOhgM1L4", "outputId": "6512cc7e-be65-4d42-ee28-52c26e00aadb" }, "execution_count": 67, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "Pipeline(steps=[('standardscaler', StandardScaler()),\n", " ('ridgecv', RidgeCV(alphas=array([ 0.1, 1. , 10. ])))])" ] }, "metadata": {}, "execution_count": 67 } ] }, { "cell_type": "code", "source": [ "importance = np.abs(model[1].coef_)\n", "threshold = np.sort(importance)[-5] + 0.01" ], "metadata": { "id": "JrrrUqq4Lo2S" }, "execution_count": 68, "outputs": [] }, { "cell_type": "code", "source": [ "# Reuse the whole pipeline (scaler + ridge) and the same training data that produced\n", "# `threshold`, so the importances compared against it are on the same scale.\n", "feature_names = np.array(X_train.columns)\n", "sfm = SelectFromModel(\n", "    model, threshold=threshold, importance_getter='named_steps.ridgecv.coef_'\n", ").fit(X_train, y_train)\n", "print(f\"Features selected by SelectFromModel: {feature_names[sfm.get_support()]}\")" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "nqtT14UoM4OS", "outputId": "603fb5b4-c3b5-4175-9079-8c8370c9f942" }, "execution_count": 72, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Features selected by SelectFromModel: ['MedInc' 'AveBedrms' 'Latitude' 'Longitude']\n" ] } ] }, { "cell_type": "markdown", "source": [ "#### Linear models with sparse coefficients (Lasso)" ], "metadata": { "id": "25OIEBwCNZUw" } }, { "cell_type": "markdown", "source": [ "It is important to keep in mind that the associations extracted depend on the model.
To illustrate this point we consider a Lasso model, that performs feature selection with a L1 penalty. Let us fit a Lasso model with a strong regularization parameters alpha" ], "metadata": { "id": "vHsNnqoQNhsT" } }, { "cell_type": "code", "source": [ "model = make_pipeline(StandardScaler(), Lasso(alpha=.015))\n", "\n", "model.fit(X_train, y_train)\n", "\n", "print(f'model score on training data: {model.score(X_train, y_train)}')\n", "print(f'model score on testing data: {model.score(X_test, y_test)}')" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "LWLjWFkmM9LL", "outputId": "e3de2fb1-32ed-4ab2-bb9d-8fd673368fb7" }, "execution_count": 73, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "model score on training data: 0.5933235371761756\n", "model score on testing data: 0.5673786563118284\n" ] } ] }, { "cell_type": "code", "source": [ "coefs = pd.DataFrame(\n", " model[1].coef_,\n", " columns=['Coefficients'], index=X_train.columns\n", ")\n", "\n", "coefs.plot(kind='barh', figsize=(9, 7))\n", "plt.title('Lasso model, strong regularization')\n", "plt.axvline(x=0, color='.5')\n", "plt.subplots_adjust(left=.3)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 444 }, "id": "LTDzbcc6NlYv", "outputId": "3f8a1cd1-2eab-4560-c2b2-394f0a196389" }, "execution_count": 74, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": { "needs_background": "light" } } ] }, { "cell_type": "markdown", "source": [ "Here the model score is a bit lower, because of the strong regularization. However, it has zeroed out 3 coefficients, selecting a small number of variables to make its prediction." ], "metadata": { "id": "eCstXIgINwgh" } }, { "cell_type": "markdown", "source": [ "#### Randomforest with feature importance" ], "metadata": { "id": "9xppGbQ4N1Zw" } }, { "cell_type": "markdown", "source": [ "On some algorithms, there are some feature importance methods, inherently built within the model. It is the case in RandomForest models. Let’s investigate the built-in feature_importances_ attribute." ], "metadata": { "id": "EZi4Yk_hN935" } }, { "cell_type": "code", "source": [ "model = RandomForestRegressor()\n", "\n", "model.fit(X_train, y_train)\n", "\n", "print(f'model score on training data: {model.score(X_train, y_train)}')\n", "print(f'model score on testing data: {model.score(X_test, y_test)}')" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "0BZSUHpLNn5X", "outputId": "4b1440c3-908d-4d09-9138-71862efa6378" }, "execution_count": 75, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "model score on training data: 0.9796271614609334\n", "model score on testing data: 0.8457060700865664\n" ] } ] }, { "cell_type": "code", "source": [ "importances = model.feature_importances_\n", "indices = np.argsort(importances)\n", "\n", "fig, ax = plt.subplots()\n", "ax.barh(range(len(importances)), importances[indices])\n", "ax.set_yticks(range(len(importances)))\n", "_ = ax.set_yticklabels(np.array(X_train.columns)[indices])" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 265 }, "id": "xBAIYWWFOCcy", "outputId": "dd7b532f-70d7-4f43-8958-2427e1cc7ff7" }, "execution_count": 76, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": { "needs_background": "light" } } ] }, { "cell_type": "markdown", "source": [ "Median income is still the most important feature. It also has a small bias toward high cardinality features, such as the noisy feature `rnd_num`, which are here predicted having `0.07` importance, more than `HouseAge` (which has low cardinality)." ], "metadata": { "id": "XbR4LmgUOLUA" } }, { "cell_type": "markdown", "source": [ "#### Feature importance by permutation" ], "metadata": { "id": "NiOjvqYDOVhR" } }, { "cell_type": "markdown", "source": [ "We introduce here a new technique to evaluate the feature importance of any given fitted model. It basically shuffles a feature and sees how the model changes its prediction. Thus, the change in prediction will correspond to the feature importance." ], "metadata": { "id": "HGPNltcJOYlo" } }, { "cell_type": "code", "source": [ "# Any model could be used here\n", "\n", "\n", "model = RandomForestRegressor()\n", "model.fit(X_train, y_train)\n", "\n", "print(f'model score on training data: {model.score(X_train, y_train)}')\n", "print(f'model score on testing data: {model.score(X_test, y_test)}')" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "W20bLMRSOJJq", "outputId": "478b016a-fc0f-4e4a-93a5-d1ff38504db5" }, "execution_count": 77, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "model score on training data: 0.9795237577232964\n", "model score on testing data: 0.8467958072484991\n" ] } ] }, { "cell_type": "code", "source": [ "r = permutation_importance(model, X_test, y_test, n_repeats=30, random_state=42)" ], "metadata": { "id": "5J9M7E1vOmMN" }, "execution_count": 79, "outputs": [] }, { "cell_type": "code", "source": [ "fig, ax = plt.subplots()\n", "\n", "indices = r.importances_mean.argsort()\n", "plt.barh(range(len(indices)), r.importances_mean[indices], xerr=r.importances_std[indices])\n", "\n", "ax.set_yticks(range(len(indices)))\n", "_ = 
ax.set_yticklabels(np.array(X_train.columns)[indices])" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 265 }, "id": "gNU5FFTsPToC", "outputId": "098861ca-aedf-4d99-902a-c9cb205b0eaa" }, "execution_count": 80, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": { "needs_background": "light" } } ] }, { "cell_type": "markdown", "source": [ "We see again that the feature `MedInc`, Latitude and Longitude are very important for the model. We note that our random variable `rnd_num` is now very less important than latitude. Indeed, the feature importance built-in in `RandomForest` has bias for continuous data, such as `AveOccup` and `rnd_num`." ], "metadata": { "id": "ZSzzW-1OQL0Y" } }, { "cell_type": "markdown", "source": [ "#### Feature rejection using Boruta" ], "metadata": { "id": "I1T6QCk1QpF_" } }, { "cell_type": "code", "source": [ "# define Boruta feature selection method\n", "feat_selector = BorutaPy(model, n_estimators='auto', verbose=2, random_state=1)" ], "metadata": { "id": "El1azRgZQohX" }, "execution_count": 85, "outputs": [] }, { "cell_type": "code", "source": [ "# find all relevant features \n", "feat_selector.fit(X_train.values, y_train.values)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "zlVL9xjvP3de", "outputId": "ba1dfec2-31cd-4947-807d-820b967ebdba" }, "execution_count": 89, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Iteration: \t1 / 100\n", "Confirmed: \t0\n", "Tentative: \t10\n", "Rejected: \t0\n", "Iteration: \t2 / 100\n", "Confirmed: \t0\n", "Tentative: \t10\n", "Rejected: \t0\n", "Iteration: \t3 / 100\n", "Confirmed: \t0\n", "Tentative: \t10\n", "Rejected: \t0\n", "Iteration: \t4 / 100\n", "Confirmed: \t0\n", "Tentative: \t10\n", "Rejected: \t0\n", "Iteration: \t5 / 100\n", "Confirmed: \t0\n", "Tentative: \t10\n", "Rejected: \t0\n", "Iteration: \t6 / 100\n", "Confirmed: \t0\n", "Tentative: \t10\n", "Rejected: \t0\n", "Iteration: \t7 / 100\n", "Confirmed: \t0\n", "Tentative: \t10\n", "Rejected: \t0\n", "Iteration: \t8 / 100\n", "Confirmed: \t9\n", "Tentative: \t0\n", "Rejected: \t1\n", "\n", "\n", "BorutaPy finished running.\n", "\n", "Iteration: \t9 / 100\n", "Confirmed: \t9\n", "Tentative: \t0\n", 
"Rejected: \t1\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ "BorutaPy(estimator=RandomForestRegressor(n_estimators=44,\n", " random_state=RandomState(MT19937) at 0x7F0639E28E20),\n", " n_estimators='auto',\n", " random_state=RandomState(MT19937) at 0x7F0639E28E20, verbose=2)" ] }, "metadata": {}, "execution_count": 89 } ] }, { "cell_type": "code", "source": [ "# check selected features \n", "np.array(X_train.columns)[feat_selector.support_]" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "V5aQGB1cRWYY", "outputId": "fdfd1412-4330-41fb-8288-558ac0186f1b" }, "execution_count": 92, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array(['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population',\n", " 'AveOccup', 'Latitude', 'Longitude', 'rnd_num'], dtype=object)" ] }, "metadata": {}, "execution_count": 92 } ] }, { "cell_type": "code", "source": [ "# check ranking of features\n", "feat_selector.ranking_" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "WaQ2itMTRX99", "outputId": "3b0688dc-ff55-4104-d169-154a8a71a7d8" }, "execution_count": 91, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array([1, 1, 1, 1, 1, 1, 1, 1, 2, 1])" ] }, "metadata": {}, "execution_count": 91 } ] }, { "cell_type": "code", "source": [ "# call transform() on X to filter it down to selected features\n", "X_filtered = feat_selector.transform(X_train.values)\n", "X_filtered.shape" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "UGaMazEIRb74", "outputId": "1c97eacc-b0c8-465f-a603-9f526d9f559b" }, "execution_count": 95, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "(7500, 9)" ] }, "metadata": {}, "execution_count": 95 } ] }, { "cell_type": "markdown", "source": [ "## Dimensional reduction" ], "metadata": { "id": "tILVdZOoUA8f" } }, { "cell_type": "markdown", "source": [ "We now looked at our model-based method 
for feature engineering: principal component analysis (PCA). You could think of PCA as a partitioning of the variation in the data. PCA is a great tool to help you discover important relationships in the data and can also be used to create more informative features." ], "metadata": { "id": "GOuAQ5oUUPTP" } }, { "cell_type": "markdown", "source": [ "There are two ways you could use PCA for feature engineering.\n", "\n", "The first way is to use it as a descriptive technique. Since the components tell you about the variation, **you could compute the MI scores for the components and see what kind of variation is most predictive of your target.** That could give you ideas for kinds of features to create -- a product of `'Height'` and `'Diameter'` if `'Size'` is important, say, or a ratio of `'Height'` and `'Diameter'` if `Shape` is important. You could even try clustering on one or more of the high-scoring components.\n", "\n", "The second way is to use the components themselves as features. Because the components expose the variational structure of the data directly, **they can often be more informative than the original features.** Here are some use-cases:\n", "- **Dimensionality reduction**: When your features are highly redundant (*multicollinear*, specifically), PCA will partition out the redundancy into one or more near-zero variance components, which you can then drop since they will contain little or no information.\n", "- **Anomaly detection**: Unusual variation, not apparent from the original features, will often show up in the low-variance components. These components could be highly informative in an anomaly or outlier detection task.\n", "- **Noise reduction**: A collection of sensor readings will often share some common background noise. 
PCA can sometimes collect the (informative) signal into a smaller number of features while leaving the noise alone, thus boosting the signal-to-noise ratio.\n", "- **Decorrelation**: Some ML algorithms struggle with highly-correlated features. PCA transforms correlated features into uncorrelated components, which could be easier for your algorithm to work with." ], "metadata": { "id": "x1CYu0lXU4vG" } }, { "cell_type": "markdown", "source": [ "PCA basically gives you direct access to the correlational structure of your data. You'll no doubt come up with applications of your own!" ], "metadata": { "id": "FBYABylCVU9_" } }, { "cell_type": "code", "source": [ "def plot_variance(pca, width=8, dpi=100):\n", "    \"\"\"Plot per-component and cumulative explained variance of a fitted PCA.\n", "\n", "    Parameters\n", "    ----------\n", "    pca : fitted object exposing `n_components_` and `explained_variance_ratio_`\n", "    width : figure width, passed to the figure as `figwidth`\n", "    dpi : figure resolution, passed to the figure as `dpi`\n", "\n", "    Returns\n", "    -------\n", "    The pair of Axes: [0] bar chart of explained variance, [1] cumulative curve.\n", "    \"\"\"\n", "    # Create figure\n", "    fig, axs = plt.subplots(1, 2)\n", "    n = pca.n_components_\n", "    grid = np.arange(1, n + 1)\n", "    # Explained variance\n", "    evr = pca.explained_variance_ratio_\n", "    axs[0].bar(grid, evr)\n", "    axs[0].set(\n", "        xlabel=\"Component\", title=\"% Explained Variance\", ylim=(0.0, 1.0)\n", "    )\n", "    # Cumulative Variance (prepend 0 so the curve starts at the origin)\n", "    cv = np.cumsum(evr)\n", "    axs[1].plot(np.r_[0, grid], np.r_[0, cv], \"o-\")\n", "    axs[1].set(\n", "        xlabel=\"Component\", title=\"% Cumulative Variance\", ylim=(0.0, 1.0)\n", "    )\n", "    # Set up figure: honour the caller's width/dpi instead of hard-coded 8 and 100\n", "    fig.set(figwidth=width, dpi=dpi)\n", "    return axs" ], "metadata": { "id": "y4WY_DqCUEsx" }, "execution_count": 97, "outputs": [] }, { "cell_type": "code", "source": [ "df = pd.read_csv(\"autos.csv\")\n", "df.head()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 352 }, "id": "KY-9l7p1VtRM", "outputId": "d3ba83cc-7903-4504-e080-6393ab5e91a5" }, "execution_count": 98, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " symboling make fuel_type aspiration num_of_doors body_style \\\n", "0 3 alfa-romero gas std 2 convertible \n", "1 3 alfa-romero gas std 2 convertible \n", "2 1 alfa-romero gas std 2 hatchback \n", "3 2 audi gas std 4 sedan \n", "4 2 audi gas std 4 sedan \n",
"\n", " drive_wheels engine_location wheel_base length ... engine_size \\\n", "0 rwd front 88.6 168.8 ... 130 \n", "1 rwd front 88.6 168.8 ... 130 \n", "2 rwd front 94.5 171.2 ... 152 \n", "3 fwd front 99.8 176.6 ... 109 \n", "4 4wd front 99.4 176.6 ... 136 \n", "\n", " fuel_system bore stroke compression_ratio horsepower peak_rpm city_mpg \\\n", "0 mpfi 3.47 2.68 9 111 5000 21 \n", "1 mpfi 3.47 2.68 9 111 5000 21 \n", "2 mpfi 2.68 3.47 9 154 5000 19 \n", "3 mpfi 3.19 3.40 10 102 5500 24 \n", "4 mpfi 3.19 3.40 8 115 5500 18 \n", "\n", " highway_mpg price \n", "0 27 13495 \n", "1 27 16500 \n", "2 26 16500 \n", "3 30 13950 \n", "4 22 17450 \n", "\n", "[5 rows x 25 columns]" ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
symbolingmakefuel_typeaspirationnum_of_doorsbody_styledrive_wheelsengine_locationwheel_baselength...engine_sizefuel_systemborestrokecompression_ratiohorsepowerpeak_rpmcity_mpghighway_mpgprice
03alfa-romerogasstd2convertiblerwdfront88.6168.8...130mpfi3.472.6891115000212713495
13alfa-romerogasstd2convertiblerwdfront88.6168.8...130mpfi3.472.6891115000212716500
21alfa-romerogasstd2hatchbackrwdfront94.5171.2...152mpfi2.683.4791545000192616500
32audigasstd4sedanfwdfront99.8176.6...109mpfi3.193.40101025500243013950
42audigasstd4sedan4wdfront99.4176.6...136mpfi3.193.4081155500182217450
\n", "

5 rows × 25 columns

\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 98 } ] }, { "cell_type": "markdown", "source": [ "We've selected four features that cover a range of properties. Each of these features also has a high MI score with the target, `price`. We'll standardize the data since these features aren't naturally on the same scale." ], "metadata": { "id": "PFIztg2qV1oA" } }, { "cell_type": "code", "source": [ "features = [\"highway_mpg\", \"engine_size\", \"horsepower\", \"curb_weight\"]\n", "\n", "X = df.copy()\n", "y = X.pop('price')\n", "X = X.loc[:, features]\n", "\n", "# Standardize\n", "X_scaled = (X - X.mean(axis=0)) / X.std(axis=0)" ], "metadata": { "id": "idZp66RfVwXt" }, "execution_count": 99, "outputs": [] }, { "cell_type": "markdown", "source": [ "Now we can fit scikit-learn's `PCA` estimator and create the principal components. You can see here the first few rows of the transformed dataset." ], "metadata": { "id": "c3wTqeTBV4tg" } }, { "cell_type": "code", "source": [ "# Create principal components\n", "pca = PCA()\n", "X_pca = pca.fit_transform(X_scaled)\n", "\n", "# Convert to dataframe\n", "component_names = [f\"PC{i+1}\" for i in range(X_pca.shape[1])]\n", "X_pca = pd.DataFrame(X_pca, columns=component_names)\n", "\n", "X_pca.head()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "6JTcIZ7DV3J-", "outputId": "c1e85fe9-e901-4a0a-9fc4-7786feb38168" }, "execution_count": 100, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " PC1 PC2 PC3 PC4\n", "0 0.382486 -0.400222 0.124122 0.169539\n", "1 0.382486 -0.400222 0.124122 0.169539\n", "2 1.550890 -0.107175 0.598361 -0.256081\n", "3 -0.408859 -0.425947 0.243335 0.013920\n", "4 1.132749 -0.814565 -0.202885 0.224138" ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PC1PC2PC3PC4
00.382486-0.4002220.1241220.169539
10.382486-0.4002220.1241220.169539
21.550890-0.1071750.598361-0.256081
3-0.408859-0.4259470.2433350.013920
41.132749-0.814565-0.2028850.224138
\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 100 } ] }, { "cell_type": "markdown", "source": [ "After fitting, the `PCA` instance contains the loadings in its `components_` attribute. We'll wrap the loadings up in a dataframe." ], "metadata": { "id": "UEnzS398V-Cv" } }, { "cell_type": "code", "source": [ "loadings = pd.DataFrame(\n", " pca.components_.T, # transpose the matrix of loadings\n", " columns=component_names, # so the columns are the principal components\n", " index=X.columns, # and the rows are the original features\n", ")\n", "loadings" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 175 }, "id": "78T7AguDV7aO", "outputId": "c2d11267-3440-4edb-9c55-1a8fa5837784" }, "execution_count": 101, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " PC1 PC2 PC3 PC4\n", "highway_mpg -0.492347 0.770892 0.070142 -0.397996\n", "engine_size 0.503859 0.626709 0.019960 0.594107\n", "horsepower 0.500448 0.013788 0.731093 -0.463534\n", "curb_weight 0.503262 0.113008 -0.678369 -0.523232" ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PC1PC2PC3PC4
highway_mpg-0.4923470.7708920.070142-0.397996
engine_size0.5038590.6267090.0199600.594107
horsepower0.5004480.0137880.731093-0.463534
curb_weight0.5032620.113008-0.678369-0.523232
\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 101 } ] }, { "cell_type": "markdown", "source": [ "Recall that the signs and magnitudes of a component's loadings tell us what kind of variation it's captured. The first component (`PC1`) shows a contrast between large, powerful vehicles with poor gas milage, and smaller, more economical vehicles with good gas milage. We might call this the \"Luxury/Economy\" axis. The next figure shows that our four chosen features mostly vary along the Luxury/Economy axis." ], "metadata": { "id": "oAx9iIwKWKrv" } }, { "cell_type": "code", "source": [ "# Look at explained variance\n", "plot_variance(pca)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 456 }, "id": "rEHb4338WIUA", "outputId": "d8d16d16-af1c-45a9-f70d-09b382af4ff5" }, "execution_count": 102, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array([,\n", " ],\n", " dtype=object)" ] }, "metadata": {}, "execution_count": 102 }, { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": { "needs_background": "light" } } ] }, { "cell_type": "markdown", "source": [ "Let's also look at the MI scores of the components. Not surprisingly, `PC1` is highly informative, though the remaining components, despite their small variance, still have a significant relationship with `price`. Examining those components could be worthwhile to find relationships not captured by the main Luxury/Economy axis." ], "metadata": { "id": "tEdlKVpvWRo2" } }, { "cell_type": "code", "source": [ "mi_scores = make_mi_scores(X_pca, y, discrete_features=False)\n", "mi_scores" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "L_EhGRNuWPfg", "outputId": "ba784c35-4b6b-42d9-e4a9-7d3a0b052ed8" }, "execution_count": 103, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "PC1 1.013190\n", "PC2 0.379271\n", "PC3 0.306780\n", "PC4 0.204163\n", "Name: MI Scores, dtype: float64" ] }, "metadata": {}, "execution_count": 103 } ] }, { "cell_type": "markdown", "source": [ "The third component shows a contrast between `horsepower` and `curb_weight` -- sports cars vs. wagons, it seems." ], "metadata": { "id": "XX88JTAsWY9f" } }, { "cell_type": "code", "source": [ "# Show dataframe sorted by PC3\n", "idx = X_pca[\"PC3\"].sort_values(ascending=False).index\n", "cols = [\"make\", \"body_style\", \"horsepower\", \"curb_weight\"]\n", "df.loc[idx, cols]" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 424 }, "id": "NhtyKfvHWUNY", "outputId": "6ddde196-0e17-46cd-962c-e0fb0db78116" }, "execution_count": 104, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " make body_style horsepower curb_weight\n", "118 porsche hardtop 207 2756\n", "117 porsche hardtop 207 2756\n", "119 porsche convertible 207 2800\n", "45 jaguar sedan 262 3950\n", "96 nissan hatchback 200 3139\n", ".. ... ... ... 
...\n", "59 mercedes-benz wagon 123 3750\n", "61 mercedes-benz sedan 123 3770\n", "101 peugot wagon 95 3430\n", "105 peugot wagon 95 3485\n", "143 toyota wagon 62 3110\n", "\n", "[193 rows x 4 columns]" ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
makebody_stylehorsepowercurb_weight
118porschehardtop2072756
117porschehardtop2072756
119porscheconvertible2072800
45jaguarsedan2623950
96nissanhatchback2003139
...............
59mercedes-benzwagon1233750
61mercedes-benzsedan1233770
101peugotwagon953430
105peugotwagon953485
143toyotawagon623110
\n", "

193 rows × 4 columns

\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 104 } ] }, { "cell_type": "markdown", "source": [ "To express this contrast, let's create a new ratio feature:" ], "metadata": { "id": "kMd55KXSWdsQ" } }, { "cell_type": "code", "source": [ "df[\"sports_or_wagon\"] = X.curb_weight / X.horsepower\n", "sns.regplot(x=\"sports_or_wagon\", y='price', data=df, order=2);" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 280 }, "id": "xY-2Px6jWaMZ", "outputId": "72bbc399-144c-4db0-ffc6-ddf09957299e" }, "execution_count": 105, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": { "needs_background": "light" } } ] }, { "cell_type": "markdown", "source": [ "## Manifold learning" ], "metadata": { "id": "hXbbcntiYDL2" } }, { "cell_type": "markdown", "source": [ "#### t-SNE" ], "metadata": { "id": "-6QWgk8taZK1" } }, { "cell_type": "code", "source": [ "digits = load_digits()" ], "metadata": { "id": "Gs1pLCRYWe9x" }, "execution_count": 111, "outputs": [] }, { "cell_type": "code", "source": [ "X = digits.images.reshape(-1, digits.images.shape[1]*digits.images.shape[2])\n", "X.shape" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "p-Bk3gJxYG4E", "outputId": "4a75ff2a-fb10-459a-ece5-5b05a3515e4c" }, "execution_count": 112, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "(1797, 64)" ] }, "metadata": {}, "execution_count": 112 } ] }, { "cell_type": "code", "source": [ "fig, ax_array = plt.subplots(5, 5)\n", "axes = ax_array.flatten()\n", "for i, ax in enumerate(axes):\n", " ax.imshow(digits.images[i], cmap='gray_r')\n", "plt.setp(axes, xticks=[], yticks=[], frame_on=False)\n", "plt.tight_layout(h_pad=0.5, w_pad=0.01)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 297 }, "id": "TdLBnz6tYKwF", "outputId": "fb0b1c55-ec19-4c49-8a2a-0d3ac581721e" }, "execution_count": 113, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAEYCAYAAACnYrZxAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAASHElEQVR4nO3dsVIcxxbG8d5bN1/wCyDkB0BIzhFVKIZETjEJhIgIMiATkSAkAlIIBLFcJTa3jHkAC3gBo30CHFy76gb9na1ujdDMt/9feFa7s7RmT03VfHO69/DwkAAAXv7zvb8AAKB5NHcAMERzBwBDNHcAMERzBwBD/x3xelGU5uzsTL62ubmZrb969Spbf/v2bbY+OTlZ8pX+1at501doLIL08uXLbP3Lly/Z+u7ubra+uLhYc/jOrtvl5WW2vrS0lK0/e/as6HNGaPW67e3tyde2tray9enp6Wz906dP2fq4/U7V7/GXX37J1s/Pz5s6dEpi3bhyBwBDNHcAMERzBwBDNHcAMERzBwBDo9IyRVQiJqWUbm5usvX7+/ts/YcffsjWT09P5TFev34dfLtumpiYyNYHg0G2/vHjx2y9Mi3Tan/88Yd8bX5+Plvv9/vZ+u3tbRNfqVVU8iX6DR0eHmbra2tr2bpKyywsLIz4dl6Oj4+zdZXCegxcuQOAIZo7ABiiuQOAIZo7ABiiuQOAoaq0jLpDrhIxKaX0559/ZutPnz7N1tXMGXXslLqblolSH6WzTb7n3fnHFs3nmJmZydbVbBk1k6fLVldXs/Uo1fbixYtsXc2WGadUjJofk5JOy7x58yZbr0lnPXnypOjfc+UOAIZo7gBgiOYOAIZo7gBgiOYOAIZo7gBgqCoKqYZ9PX/+XL5HRR4VFcnqsv39/Wx9Z2dHvmc4HBYdQ23L50jFzFLSsTH1HsfBauo39/nzZ/keFWdWkUfVCyq32Ws1FXdMSUcb1TZ76jxUgwJTivtEDlfuAGCI5g4AhmjuAGCI5g4AhmjuAGCo0bSMGvbV5DG6fBde3SFXd9RTKv97o+FGXaX+JpU+SikeKpYTJSHcRMm1v/76K1tXaRlV//XXX+Ux2v4bvri4yNY3Njbke5aXl4uOcXBwkK0fHR0VfU6EK3cAMERzBwBDNHcAMERzBwBDNHcAMFSVllF3u6Mt8BSVivntt9+y9Z9//rn4GONEbdnX5e331EwNlTiIqBRNNNNjnKjftkq/rK2tZet7e3vyGG/fvi3/Yo+o3+8X1VNK6eTkJFuPttDMUdtA1uDKHQAM0dwBwBDNHQAM0dwBwBDNHQAMVaVl1GwKlXBJKaWzs7OiurK5uVn079F9avbO5eWlfM/19XW2rtIIaiemlZUVeYyu7t60tbUlXyvdcenDhw/ZepdTbWo3s2huk0rFqM9Ss2iaTG1x5Q4AhmjuAGCI5g4AhmjuAGCI5g4AhmjuAGCo0ShkNCxIRRh/+umnbL1mCFlXRfEnFbdTW4GpeGC0lV/bqaFn0VAm9ZoaQqbW88mTJ/IYXY1CRtvcra6uFn2WijweHh4WfU7Xqd/wcDjM1h/j98iVOwAYorkDgCGaOwAYorkDgCGaOwAY6j08PHzv7wAAaBhX7gBgiOYOAIZo7gBgiOYOAIZGjR8outuqdh1JST/GfXx8XHKIWr3HOMj/aewudemuMNEj+RVavW77+/vyNbU+5+fn2brauanf78tj3N7eZusTExOtXrc3b97I19T6qMfl1WdV7ijU6nVTu3ilpM+3aLewBmXXjSt3ADBEcwcAQzR3ADBEcwcAQ6OeUC264RDNvr67uyv5qDQ1NZWtq5tYI7T6Ro2aJZ6Svomzvb2drat55ZVavW7RDVVFzYZXn6VulKUU3ixr9bpFwYfS35f6zVfeSGzFuqk1mJ6ebuzAMzMz2XplIIIbqgAwLmjuAGCI5g4AhmjuAGCI5g4AhkaNHygSPXKs0
jLq8e7Sx+5HHb/NVPIlEj0KPS6ix+gVlSZSCYlHenz8UanEUErlY0LUby5atyit0wZRj1Hm5uay9YbTREW4cgcAQzR3ADBEcwcAQzR3ADBEcwcAQzR3ADDUaBQyGhymdroZDofZuoprdTXuGImiV2rAUBRnc6NiYzVxstJhY2pnopT07kRtF33v2dnZbD3YdSpbj3pB29V8d3WeqMhyTdyyFFfuAGCI5g4AhmjuAGCI5g4AhmjuAGCo0bRMlCxQyQa1rdTGxkbx8WsGSbVBdOdc3blXqQ91d94xvRBtSVaapFHnbtuHXNWoSWoMBoNs/ebmJlvv8vmmEkAquZZSSpOTk9n6+vp6tq7O3Wibw9I15codAAzR3AHAEM0dAAzR3AHAEM0dAAw1mpaJNJU6iO4md1V0F1ylFFTiQaWMrq6u5DHaPqdGrU+Uzur1ekXvcUzFqETG/Py8fI/a8lH97lQ6K/q/6WqSJkpnqddKf1tR4i9a0xyu3AHAEM0dAAzR3AHAEM0dAAzR3AHAUKNpmYuLC/lav9/P1nd2doqOoe7Od1m0M45Kv6jEgUo1RHfa256WUaJkgTrf5ubmvtXXaR11jqi1SUmvqTqv1M5Nx8fH8hilv/kuUL8htZ5qfUoTMRGu3AHAEM0dAAzR3AHAEM0dAAzR3AHAEM0dAAw1GoX8+PGjfO3g4KDos5aXl7N1xwFPURRSRdBUlEqtj2OENNpK7+TkJFtXW6g5Un9r9BtS28Wp+OTi4mK23tUtLyPR36QGh6kBf+rcbTKWzJU7ABiiuQOAIZo7ABiiuQOAIZo7ABjqPTw8fO/vAABoGFfuAGCI5g4AhmjuAGCI5g4AhkaNHyi626oetU1J775S+hh95U4lvZo3fYVvfpda7bKjHjmPHtUPHslv9bpFO3+9e/cuW1fnT8NjCVqxbmp0xf7+vvwg9XtU66PGWkQjNYJH7FuxbjVUf1Nrrf5vKs/D7Lpx5Q4AhmjuAGCI5g4AhmjuAGCo0Xnu0U0UdfNre3s7W1c3dlR91PG7Sq3b3d1dUT262d3VGedq5n9K+m9S54/j/HF10y66ua7WQZ0/ap+G6Jxqcmb5Y4p+Q+q8UsGHmmOU/k65cgcAQzR3ADBEcwcAQzR3ADBEcwcAQ1VpGXUXPnocXCUb1GO76q6x2mXc1fr6etG/n5uby9ZL79p3QfQ3qUSIelzeMS2jRnhEvyGV+lC/036/n62rde6y6BxR/UqNu1Dnrvo/iz5L4codAAzR3AHAEM0dAAzR3AHAEM0dAAzR3AHAUFUUsmbQVOlQr64Os4qouFQUsVKDwMaJit5GA6jU+aM+C/9TGrdTscouR2/V7kknJyfyPWrnL7UOw+EwW29yqBpX7gBgiOYOAIZo7gBgiOYOAIZo7gBgqCotM27Du5qikhpRgmNqaipbVymarm5hFlGJAzXMKlK6DaFjaiuikiLqvFJJr9LUTZvUJKrUwDW1nsrs7GzxsRWu3AHAEM0dAAzR3AHAEM0dAAzR3AHAUO/h4SF6PfuiShZMTk7KD1J3z9W2cGoWTZSQCJIiPfmmbyNc1BJq60K1jZna9kz9n43Q2XVT6QWV7qhcH6Wz66aUzveJ0jLBVnKtWLeaGVDq71UzZFQKrnL2UXbduHIHAEM0dwAwRHMHAEM0dwAwRHMHAEON7sSkki8p6Z1K3r9/X3QMx9kpEZV+UcZpFkqUXjg4OMjW1Xqqz4rWUyW62rILkUp9DAYD+Z77+/tsXc1IUWmQLu94pf7PVQIrpfIEYZAYagxX7gBgiOYOAIZo7gBgiOYOAIZo7gBgiOYOAIaqopBKNCxIRc3Uln1R7GicqOjnzMxMtn59fZ2tR4OxuhqfVFHElJobdBWtjYqztT0KqWLJNRYXF7P16P/GkepvKnr7GOvDlTsAGKK5A4AhmjsAGKK5A4AhmjsAGBq1zR4AoIO4cgcAQzR3ADBEcwcAQzR3ADA0avxA9m7rxcVF9h9HjzWrR6HV4
/LKzc2NfC147LtXdJCv19hd6p2dnWxd7YyjHruvHDHQ6nWLRiqo9VH1paWlbL1yDEar1y169F2NZ1DroEYwqHUeodXrFp0L6ndaum6VsuvGlTsAGKK5A4AhmjsAGKK5A4AhmjsAGKrarOPo6ChbHwwG8j1qaP329na23vaNEB7L5eVltq7SL13deCOiNnSp2axDrY9aZ0fROaLWWr1HpUHU5hUpdfc3HG1GdHd3l60/Ulomiyt3ADBEcwcAQzR3ADBEcwcAQzR3ADBUlZZR8yfUnfboPequumPqQ4nWTSWQojk+blQSQZ1TKZUnbFS6xpGao5OSngmjEi7qd9rVREyk5nw7OTnJ1tUsmibXjSt3ADBEcwcAQzR3ADBEcwcAQzR3ADBEcwcAQ1VRSEVF1qLXVLxonKJpURRSieJsbhYXF7P1qakp+R61FaQa/qTWMzoPuxr3iyJ9at2Wl5ez9cptCDspGoamBs+pc0R9VjScrBRX7gBgiOYOAIZo7gBgiOYOAIZo7gBgqCoto+701qQHVlZWar6ClS9fvhS/Z3p6OlufmZnJ1nd3d+VnqTRK283Ozjb2WWrAU5SW6erWfFHSSp0/auDaOA34i/7W0nNB/R/UDF9UuHIHAEM0dwAwRHMHAEM0dwAwRHMHAEO9h4eH6PXwxRJqZoW6a3x1dZWtl94x/kev5k1foWjdorvww+EwW19fXy/6QtHMiiAR0op1U2kitSVcSjq9oP5WlQaJkiXBudiKdVNevnwpXyudvdNwYqjV69Yk1Q+Pjo7ke4LfcHbduHIHAEM0dwAwRHMHAEM0dwAwRHMHAENVs2VUemEwGMj3qJ1c1CyLylRMJ0WJg9Idl9Tcn4ODA/kelSBpy05DKk20s7Mj36PSL+rcjT6rq9TfGv2/qveM085oSjQDqnQ3tZubm2xdpWhSKv+dcuUOAIZo7gBgiOYOAIZo7gBgiOYOAIZo7gBgqCoKqWI/Ku6Ykh6AFQ20GhdR7FNFG1V0T0Ueo6302hJ5bJKKrUVDs9yoCGkU6VPrE0Vpx8Xx8bF8bWNjo+izVAQ8+p2WbmnIlTsAGKK5A4AhmjsAGKK5A4AhmjsAGBq1zR4AoIO4cgcAQzR3ADBEcwcAQzR3ADA0avxA0d3WaDcStTOOevRdPQatHscfoVfzpq/Q2F1qtfvK9PR00eeonV9SCscPtHrdot2Tdnd3s3U17iJ67LtCK9bt/v4++4/39vbkB3348CFb//3337N19Uj82dmZPMbCwoJ6qRXr1iTVx9Qog8pRINl148odAAzR3AHAEM0dAAzR3AHA0KgnVItuOCwtLcnXLi4uSj5KcrwxGLm8vMzW5+fniz6ny+tWM5td/U3qPGz4Se1WrNunT5+y/3hra0t+0IsXL4oOrG7ARtT3Si1ZtxrqBqm66a/CJ6Uz2//BDVUAGBc0dwAwRHMHAEM0dwAwRHMHAEOjxg9kqQRHlIhZX1/P1tXd5GfPnpV+rdZTqY9obEP0iH3O3Nxctl75WHMrqARBdI6o9IJ6j/o/6PJ5qJIvNQmXz58/Z+unp6fZ+traWvEx2k71vZRSWllZydbfvXuXre/v72frpb/3CFfuAGCI5g4AhmjuAGCI5g4AhmjuAGCI5g4AhqqikDVU9Ee5u7v7Rt/k+1HxvI2Njcf9IiaiXblU7PT6+jpb73JUtEkq8vjjjz9m68+fP8/WV1dXG/tObRGdbyrqrd7T6+VnpEXnodrNTuHKHQAM0dwBwBDNHQAM0dwBwBDNHQAMNbrNnroDnFJK9/f32boaCqW2UIu2VguG7rR6+65ocJhKGZ2cnGTrU1NT2frt7W3JV/pXq9ctolIKaq2joVAVOrtuytOnT7P1vb29bP3169c1h2nFuqlUmxoOllJKy8vL2bpKbTW83SPb7AHAuKC5A4AhmjsAGKK5A4AhmjsAGGp0tky/35evqSSLSoOou8yOM0CirdxK/17H9VGilJFKPKiUEWKvXr3K1jc3N
7P1yrRMK6gZLpOTk/I979+/z9YrU2qN4ModAAzR3AHAEM0dAAzR3AHAEM0dAAw1mpaJdgo5Pz8v+iyVlllaWir6nK4rTb8MBoNsPbpr39WETc25cHV1VVSPjhGlnNpMzYNJSc+AOj09zdbV79TR4uJi8Ws1c2qawpU7ABiiuQOAIZo7ABiiuQOAIZo7ABiiuQOAoUa32YvidiomqaJUaqBYtM1eoBXbd9VQa6rWUw3TiqKowZq2et3UOZJSefRWiaJ+amu+iYmJVq9bNABL/b0LCwvZ+uHhYbautuUbodXrVqN0u9Bgq9AI2+wBwLiguQOAIZo7ABiiuQOAIZo7ABgalZYBAHQQV+4AYIjmDgCGaO4AYIjmDgCGaO4AYIjmDgCG/gZBh5f5FuY3VAAAAABJRU5ErkJggg==\n" }, "metadata": {} } ] }, { "cell_type": "code", "source": [ "otsne = oTSNE(\n", " n_components=2,\n", " perplexity=30,\n", " initialization='pca', \n", " n_jobs=2,\n", " random_state=0,\n", " negative_gradient_method='auto', \n", " verbose=True,\n", ")" ], "metadata": { "id": "PBOWpizzYL6X" }, "execution_count": 114, "outputs": [] }, { "cell_type": "code", "source": [ "embedding = otsne.fit(X)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "os4KFGwKYVyF", "outputId": "357f45eb-60da-4ab7-9b25-0da15b954b26" }, "execution_count": 115, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "--------------------------------------------------------------------------------\n", "TSNE(n_jobs=2, random_state=0, verbose=True)\n", "--------------------------------------------------------------------------------\n", "===> Finding 90 nearest neighbors using Annoy approximate search using euclidean distance...\n", " --> Time elapsed: 0.92 seconds\n", "===> Calculating affinity matrix...\n", " --> Time elapsed: 0.11 seconds\n", "===> Calculating PCA-based initialization...\n", " --> Time elapsed: 0.02 seconds\n", "===> Running optimization with exaggeration=12.00, lr=200.00 for 250 iterations...\n", "Iteration 50, KL divergence 3.1044, 50 iterations in 0.5826 sec\n", "Iteration 100, KL divergence 2.7273, 50 iterations in 0.4728 sec\n", "Iteration 150, KL divergence 2.6609, 50 iterations in 0.4515 sec\n", "Iteration 200, KL divergence 2.6357, 50 iterations in 0.4789 sec\n", "Iteration 250, KL divergence 2.6227, 50 iterations 
in 0.4703 sec\n", " --> Time elapsed: 2.46 seconds\n", "===> Running optimization with exaggeration=1.00, lr=200.00 for 500 iterations...\n", "Iteration 50, KL divergence 1.2848, 50 iterations in 0.4899 sec\n", "Iteration 100, KL divergence 1.0173, 50 iterations in 0.4494 sec\n", "Iteration 150, KL divergence 0.9110, 50 iterations in 0.4788 sec\n", "Iteration 200, KL divergence 0.8550, 50 iterations in 0.4884 sec\n", "Iteration 250, KL divergence 0.8216, 50 iterations in 0.4819 sec\n", "Iteration 300, KL divergence 0.8011, 50 iterations in 0.4725 sec\n", "Iteration 350, KL divergence 0.7875, 50 iterations in 0.4661 sec\n", "Iteration 400, KL divergence 0.7790, 50 iterations in 0.4494 sec\n", "Iteration 450, KL divergence 0.7735, 50 iterations in 0.4512 sec\n", "Iteration 500, KL divergence 0.7699, 50 iterations in 0.4846 sec\n", " --> Time elapsed: 4.72 seconds\n" ] } ] }, { "cell_type": "code", "source": [ "plt.figure(figsize=(10, 8))\n", "plt.scatter(embedding[:, 0], embedding[:, 1], c=digits.target, cmap='Spectral', s=5)\n", "plt.gca().set_aspect('equal', 'datalim')\n", "plt.colorbar(boundaries=np.arange(11)-0.5).set_ticks(np.arange(10))\n", "plt.title('tSNE of the Digits dataset', fontsize=24)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 525 }, "id": "lsVmVDwUYXnu", "outputId": "9fc9155e-fcdd-4cf1-9537-caba4d0defe8" }, "execution_count": 116, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "Text(0.5, 1.0, 'tSNE of the Digits dataset')" ] }, "metadata": {}, "execution_count": 116 }, { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": { "needs_background": "light" } } ] }, { "cell_type": "markdown", "source": [ "#### UMAP" ], "metadata": { "id": "gJIG1-zcacTO" } }, { "cell_type": "markdown", "source": [ "UMAP is useful for generating visualisations, but if you want to make use of UMAP more generally for machine learning tasks it is important to be able to train a model and then later pass new data to the model and have it transform that data into the learned space. For example if we use UMAP to learn a latent space and then train a classifier on data transformed into the latent space then the classifier is only useful for prediction if we can transform data for which we want a prediction into the latent space the classifier uses. " ], "metadata": { "id": "RFco1ZdqawkV" } }, { "cell_type": "code", "source": [ "X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, stratify=digits.target, random_state=42)" ], "metadata": { "id": "Xd89JSOraYKk" }, "execution_count": 117, "outputs": [] }, { "cell_type": "markdown", "source": [ "Now to get a benchmark idea of what we are looking at let’s train a couple of different classifiers and then see how well they score on the test set. For this example let’s try a support vector classifier and a KNN classifier." 
], "metadata": { "id": "4MoEzfIIbGh_" } }, { "cell_type": "code", "source": [ "svc = SVC(gamma='auto').fit(X_train, y_train)\n", "knn = KNeighborsClassifier().fit(X_train, y_train)\n", "svc.score(X_test, y_test), knn.score(X_test, y_test)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Iv6WyDIRa__H", "outputId": "817ceab4-11cd-44b4-de28-91f2a6d38c57" }, "execution_count": 121, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "(0.62, 0.9844444444444445)" ] }, "metadata": {}, "execution_count": 121 } ] }, { "cell_type": "markdown", "source": [ "The goal now is to make use of UMAP as a preprocessing step that one could potentially fit into a pipeline. " ], "metadata": { "id": "Gx02oiB3b8nV" } }, { "cell_type": "code", "source": [ "trans = umap.UMAP(n_neighbors=5, random_state=42).fit(X_train)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "SU4gnbqrbM9h", "outputId": "47e1956e-22ad-42cc-8906-69725e5f773f" }, "execution_count": 122, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.7/dist-packages/numba/np/ufunc/parallel.py:363: NumbaWarning: The TBB threading layer requires TBB version 2019.5 or later i.e., TBB_INTERFACE_VERSION >= 11005. Found TBB_INTERFACE_VERSION = 9107. 
The TBB threading layer is disabled.\n", " warnings.warn(problem)\n" ] } ] }, { "cell_type": "code", "source": [ "plt.figure(figsize=(10, 8))\n", "plt.scatter(trans.embedding_[:, 0], trans.embedding_[:, 1], c=y_train, cmap='Spectral', s=5)\n", "plt.gca().set_aspect('equal', 'datalim')\n", "plt.colorbar(boundaries=np.arange(11)-0.5).set_ticks(np.arange(10))\n", "plt.title('Umap of the Digits dataset', fontsize=24)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 525 }, "id": "vKQdE0_LcFMG", "outputId": "6e5d02a2-03c0-47e0-9291-829be88b7baf" }, "execution_count": 125, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "Text(0.5, 1.0, 'Umap of the Digits dataset')" ] }, "metadata": {}, "execution_count": 125 }, { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": { "needs_background": "light" } } ] }, { "cell_type": "markdown", "source": [ "This looks very promising! Most of the classes got very cleanly separated, and that gives us some hope that it could help with classifier performance. We can now train some new models (again an SVC and a KNN classifier) on the embedded training data. This looks exactly as before but now we pass it the embedded data. " ], "metadata": { "id": "NVGIGUqCcY-r" } }, { "cell_type": "code", "source": [ "svc = SVC(gamma='auto').fit(trans.embedding_, y_train)\n", "knn = KNeighborsClassifier().fit(trans.embedding_, y_train)" ], "metadata": { "id": "vaiBl0iycI3W" }, "execution_count": 131, "outputs": [] }, { "cell_type": "code", "source": [ "test_embedding = trans.transform(X_test)" ], "metadata": { "id": "g1_0phAhchBB" }, "execution_count": 128, "outputs": [] }, { "cell_type": "markdown", "source": [ "The next important question is what the transform did to our test data. In principle we have a new two dimensional representation of the test-set, and ideally this should be based on the existing embedding of the training set" ], "metadata": { "id": "yqqhDql-c0M9" } }, { "cell_type": "code", "source": [ "plt.figure(figsize=(10, 8))\n", "plt.scatter(test_embedding[:, 0], test_embedding[:, 1], c=y_test, cmap='Spectral', s=5)\n", "plt.gca().set_aspect('equal', 'datalim')\n", "plt.colorbar(boundaries=np.arange(11)-0.5).set_ticks(np.arange(10))\n", "plt.title('Umap of the Digits dataset', fontsize=24)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 525 }, "id": "gblKRyguczF7", "outputId": "b8edc9f9-21aa-4d60-b5c5-9260820bf94e" }, "execution_count": 130, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "Text(0.5, 1.0, 'Umap of the Digits dataset')" ] }, "metadata": {}, "execution_count": 130 }, { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": { "needs_background": "light" } } ] }, { "cell_type": "markdown", "source": [ "The results look like what we should expect; the test data has been embedded into two dimensions in exactly the locations we should expect (by class) given the embedding of the training data visualised above. This means we can now try out models that were trained on the embedded training data by handing them the newly transformed test set." ], "metadata": { "id": "LI9PGw5CdBFH" } }, { "cell_type": "code", "source": [ "svc.score(trans.transform(X_test), y_test), knn.score(trans.transform(X_test), y_test)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "__808eXkc38M", "outputId": "960e5116-027b-4929-de7b-a88e6ef8337c" }, "execution_count": 132, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "(0.9822222222222222, 0.9822222222222222)" ] }, "metadata": {}, "execution_count": 132 } ] }, { "cell_type": "markdown", "source": [ "The results are pretty good. While the accuracy of the KNN classifier did not improve there was not a lot of scope for improvement given the data. On the other hand the SVC has improved to have equal accuracy to the KNN classifier!\n", "\n", "For more interesting datasets the larger dimensional embedding might have been a significant gain – it is certainly worth exploring as one of the parameters in a grid search across a pipeline that includes UMAP.\n", "\n" ], "metadata": { "id": "1GN4oP95dSGe" } }, { "cell_type": "markdown", "source": [ "## Clustering" ], "metadata": { "id": "KqWbd1rSdync" } }, { "cell_type": "markdown", "source": [ "When used for feature engineering, we could attempt to discover groups of customers representing a market segment, for instance, or geographic areas that share similar weather patterns. Adding a feature of cluster labels can help machine learning models untangle complicated relationships of space or proximity." 
], "metadata": { "id": "Gwnv4I6BesNV" } }, { "cell_type": "markdown", "source": [ "### Cluster Labels as a feature" ], "metadata": { "id": "WErqxr32gLk1" } }, { "cell_type": "markdown", "source": [ "Applied to a single real-valued feature, clustering acts like a traditional \"binning\" or \"discretization\" transform. On multiple features, it's like \"multi-dimensional binning\" (sometimes called vector quantization)." ], "metadata": { "id": "neHcqLiFgQfw" } }, { "cell_type": "markdown", "source": [ "It's important to remember that this Cluster feature is categorical. Here, it's shown with a label encoding (that is, as a sequence of integers) as a typical clustering algorithm would produce; depending on your model, a one-hot encoding may be more appropriate.\n", "\n", "The motivating idea **for adding cluster labels is that the clusters will break up complicated relationships across features into simpler chunks**. Our model can then just learn the simpler chunks one-by-one instead of having to learn the complicated whole all at once. It's a \"divide and conquer\" strategy." ], "metadata": { "id": "igCdboqEgZGm" } }, { "cell_type": "markdown", "source": [ "As spatial features, [*California Housing*](https://www.kaggle.com/camnugent/california-housing-prices)'s `'Latitude'` and `'Longitude'` make natural candidates for k-means clustering. In this example we'll cluster these with `'MedInc'` (median income) to create economic segments in different regions of California." ], "metadata": { "id": "rxryA26wfL5-" } }, { "cell_type": "markdown", "source": [ "Since k-means clustering is sensitive to scale, it can be a good idea to rescale or normalize data with extreme values. Our features are already roughly on the same scale, so we'll leave them as-is." 
], "metadata": { "id": "4WNdYtx3fHYl" } }, { "cell_type": "code", "source": [ "df = fetch_california_housing(as_frame=True)['frame']" ], "metadata": { "id": "mfv9WpC_dGs9" }, "execution_count": 136, "outputs": [] }, { "cell_type": "code", "source": [ "X = df.loc[:, [\"MedInc\", \"Latitude\", \"Longitude\"]]\n", "X.head()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "n-kpl0jAfTF7", "outputId": "665d6557-c7d7-452f-956a-d5a9210dc570" }, "execution_count": 138, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " MedInc Latitude Longitude\n", "0 8.3252 37.88 -122.23\n", "1 8.3014 37.86 -122.22\n", "2 7.2574 37.85 -122.24\n", "3 5.6431 37.85 -122.25\n", "4 3.8462 37.85 -122.25" ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
MedIncLatitudeLongitude
08.325237.88-122.23
18.301437.86-122.22
27.257437.85-122.24
35.643137.85-122.25
43.846237.85-122.25
\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 138 } ] }, { "cell_type": "code", "source": [ "# Create cluster feature\n", "kmeans = KMeans(n_clusters=6)\n", "X[\"Cluster\"] = kmeans.fit_predict(X)\n", "X[\"Cluster\"] = X[\"Cluster\"].astype(\"category\")\n", "\n", "X.head()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "iavDcVrRfUzD", "outputId": "2873b3a9-478c-4b68-b458-202b72c729e4" }, "execution_count": 141, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " MedInc Latitude Longitude Cluster\n", "0 8.3252 37.88 -122.23 0\n", "1 8.3014 37.86 -122.22 0\n", "2 7.2574 37.85 -122.24 0\n", "3 5.6431 37.85 -122.25 0\n", "4 3.8462 37.85 -122.25 3" ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
MedIncLatitudeLongitudeCluster
08.325237.88-122.230
18.301437.86-122.220
27.257437.85-122.240
35.643137.85-122.250
43.846237.85-122.253
\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 141 } ] }, { "cell_type": "markdown", "source": [ "Notice the difference between `predict` and `transform` in `KMeans`: `predict` returns the index of the closest cluster for each sample in X, whereas `transform` maps the data into a cluster-distance space in which each dimension is the distance to one cluster center.\n" ], "metadata": { "id": "vtsRG70AguME" } }, { "cell_type": "markdown", "source": [ "Now let's look at a couple plots to see how effective this was. First, a scatter plot that shows the geographic distribution of the clusters. It seems like the algorithm has created separate segments for higher-income areas on the coasts." ], "metadata": { "id": "eUmP_ZxCflXO" } }, { "cell_type": "code", "source": [ "sns.relplot(\n", " x=\"Longitude\", y=\"Latitude\", hue=\"Cluster\", data=X, height=6,\n", ");" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 441 }, "id": "6iwt9GcFfgmT", "outputId": "dbe380ba-7855-4964-825c-4f0f1790673e" }, "execution_count": 142, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": { "needs_background": "light" } } ] }, { "cell_type": "markdown", "source": [ "The target in this dataset is `MedHouseVal` (median house value). These box-plots show the distribution of the target within each cluster. If the clustering is informative, these distributions should, for the most part, separate across `MedHouseVal`, which is indeed what we see." ], "metadata": { "id": "J_w0SDVRfsDP" } }, { "cell_type": "code", "source": [ "X[\"MedHouseVal\"] = df[\"MedHouseVal\"]\n", "sns.catplot(x=\"MedHouseVal\", y=\"Cluster\", data=X, kind=\"boxen\", height=6);" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 441 }, "id": "BhTEVdW2fo3m", "outputId": "42fbcb32-f26e-4d74-c420-6da76ea85e70" }, "execution_count": 143, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": { "needs_background": "light" } } ] }, { "cell_type": "markdown", "source": [ "### Cluster distance as a feature" ], "metadata": { "id": "hLGtkXQ0h_Fz" } }, { "cell_type": "code", "source": [ "X_digits, y_digits = load_digits(return_X_y=True)\n", "X_train, X_test, y_train, y_test = train_test_split(X_digits, y_digits, random_state=42)" ], "metadata": { "id": "hWGVLhQ0iXFs" }, "execution_count": 144, "outputs": [] }, { "cell_type": "markdown", "source": [ "Now let's fit a Logistic Regression model and evaluate it on the test set:" ], "metadata": { "id": "RzK45mJ0ifGF" } }, { "cell_type": "code", "source": [ "log_reg = LogisticRegression(multi_class=\"ovr\", solver=\"lbfgs\", max_iter=5000, random_state=42)\n", "log_reg.fit(X_train, y_train)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "GWYzRRBRidDF", "outputId": "f93ff00f-3d10-48dc-862a-38392bf62cce" }, "execution_count": 147, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "LogisticRegression(max_iter=5000, multi_class='ovr', random_state=42)" ] }, "metadata": {}, "execution_count": 147 } ] }, { "cell_type": "code", "source": [ "log_reg_score = log_reg.score(X_test, y_test)\n", "log_reg_score" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "f-VVacUdik0W", "outputId": "d3bf3fed-a304-43b7-c28e-553a89ef5628" }, "execution_count": 148, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "0.9688888888888889" ] }, "metadata": {}, "execution_count": 148 } ] }, { "cell_type": "markdown", "source": [ "Okay, that's our baseline: 96.89% accuracy. Let's see if we can do better by using K-Means as a preprocessing step. 
**We will create a pipeline that will first cluster the training set into 50 clusters and replace the images with their distances to the 50 clusters**, then apply a logistic regression model:" ], "metadata": { "id": "MLbjHeZ6iqJl" } }, { "cell_type": "code", "source": [ "pipeline = Pipeline([\n", " (\"kmeans\", KMeans(n_clusters=50, random_state=42)),\n", " (\"log_reg\", LogisticRegression(multi_class=\"ovr\", solver=\"lbfgs\", max_iter=5000, random_state=42)),\n", "])\n", "pipeline.fit(X_train, y_train)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "5WPkttz6iv0V", "outputId": "da49d07d-ce4d-4eda-d983-d83b0c7af52d" }, "execution_count": 153, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "Pipeline(steps=[('kmeans', KMeans(n_clusters=50, random_state=42)),\n", " ('log_reg',\n", " LogisticRegression(max_iter=5000, multi_class='ovr',\n", " random_state=42))])" ] }, "metadata": {}, "execution_count": 153 } ] }, { "cell_type": "code", "source": [ "pipeline_score = pipeline.score(X_test, y_test)\n", "pipeline_score" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "pamadM2-jJsl", "outputId": "57384f0f-fa7a-4f00-fb66-eabef8511313" }, "execution_count": 154, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "0.9777777777777777" ] }, "metadata": {}, "execution_count": 154 } ] }, { "cell_type": "markdown", "source": [ "How much did the error rate drop?" ], "metadata": { "id": "G9o8gEHBjU5l" } }, { "cell_type": "code", "source": [ "1 - (1 - pipeline_score) / (1 - log_reg_score)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "IH5gTeDgjS9m", "outputId": "16a6eeca-4644-484b-bdbe-eb229db5195f" }, "execution_count": 155, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "0.28571428571428414" ] }, "metadata": {}, "execution_count": 155 } ] }, { "cell_type": "markdown", "source": [ "How about that? 
We reduced the error rate by over 28%! But we chose the number of clusters k completely arbitrarily; we can surely do better. Since K-Means is just a preprocessing step in a classification pipeline, finding a good value for k is much simpler than in plain clustering: the best value of k is simply the one that results in the best classification performance." ], "metadata": { "id": "ZGJUnrTQjSFd" } }, { "cell_type": "code", "source": [ "param_grid = dict(kmeans__n_clusters=range(2, 20))\n", "grid_clf = GridSearchCV(pipeline, param_grid, cv=3, verbose=2)\n", "grid_clf.fit(X_train, y_train)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "CJcg77rljlms", "outputId": "591abfbc-8fd5-48ca-9123-ee1bceb033e2" }, "execution_count": 160, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Fitting 3 folds for each of 18 candidates, totalling 54 fits\n", "[CV] END ...............................kmeans__n_clusters=2; total time= 0.3s\n", "[CV] END ...............................kmeans__n_clusters=2; total time= 0.4s\n", "[CV] END ...............................kmeans__n_clusters=2; total time= 0.3s\n", "[CV] END ...............................kmeans__n_clusters=3; total time= 0.4s\n", "[CV] END ...............................kmeans__n_clusters=3; total time= 0.4s\n", "[CV] END ...............................kmeans__n_clusters=3; total time= 0.3s\n", "[CV] END ...............................kmeans__n_clusters=4; total time= 0.3s\n", "[CV] END ...............................kmeans__n_clusters=4; total time= 0.4s\n", "[CV] END ...............................kmeans__n_clusters=4; total time= 0.4s\n", "[CV] END ...............................kmeans__n_clusters=5; total time= 0.6s\n", "[CV] END ...............................kmeans__n_clusters=5; total time= 0.5s\n", "[CV] END ...............................kmeans__n_clusters=5; total time= 0.5s\n", "[CV] END ...............................kmeans__n_clusters=6; total time= 0.5s\n", "[CV] END 
...............................kmeans__n_clusters=6; total time= 0.5s\n", "[CV] END ...............................kmeans__n_clusters=6; total time= 0.5s\n", "[CV] END ...............................kmeans__n_clusters=7; total time= 0.8s\n", "[CV] END ...............................kmeans__n_clusters=7; total time= 0.5s\n", "[CV] END ...............................kmeans__n_clusters=7; total time= 0.5s\n", "[CV] END ...............................kmeans__n_clusters=8; total time= 0.6s\n", "[CV] END ...............................kmeans__n_clusters=8; total time= 0.8s\n", "[CV] END ...............................kmeans__n_clusters=8; total time= 0.7s\n", "[CV] END ...............................kmeans__n_clusters=9; total time= 0.8s\n", "[CV] END ...............................kmeans__n_clusters=9; total time= 0.7s\n", "[CV] END ...............................kmeans__n_clusters=9; total time= 0.7s\n", "[CV] END ..............................kmeans__n_clusters=10; total time= 0.8s\n", "[CV] END ..............................kmeans__n_clusters=10; total time= 1.0s\n", "[CV] END ..............................kmeans__n_clusters=10; total time= 0.9s\n", "[CV] END ..............................kmeans__n_clusters=11; total time= 1.4s\n", "[CV] END ..............................kmeans__n_clusters=11; total time= 1.5s\n", "[CV] END ..............................kmeans__n_clusters=11; total time= 1.3s\n", "[CV] END ..............................kmeans__n_clusters=12; total time= 1.4s\n", "[CV] END ..............................kmeans__n_clusters=12; total time= 1.6s\n", "[CV] END ..............................kmeans__n_clusters=12; total time= 1.8s\n", "[CV] END ..............................kmeans__n_clusters=13; total time= 2.0s\n", "[CV] END ..............................kmeans__n_clusters=13; total time= 1.7s\n", "[CV] END ..............................kmeans__n_clusters=13; total time= 1.9s\n", "[CV] END ..............................kmeans__n_clusters=14; total time= 
2.0s\n", "[CV] END ..............................kmeans__n_clusters=14; total time= 2.2s\n", "[CV] END ..............................kmeans__n_clusters=14; total time= 2.1s\n", "[CV] END ..............................kmeans__n_clusters=15; total time= 2.5s\n", "[CV] END ..............................kmeans__n_clusters=15; total time= 2.2s\n", "[CV] END ..............................kmeans__n_clusters=15; total time= 2.3s\n", "[CV] END ..............................kmeans__n_clusters=16; total time= 2.8s\n", "[CV] END ..............................kmeans__n_clusters=16; total time= 2.9s\n", "[CV] END ..............................kmeans__n_clusters=16; total time= 2.8s\n", "[CV] END ..............................kmeans__n_clusters=17; total time= 3.1s\n", "[CV] END ..............................kmeans__n_clusters=17; total time= 2.9s\n", "[CV] END ..............................kmeans__n_clusters=17; total time= 2.9s\n", "[CV] END ..............................kmeans__n_clusters=18; total time= 3.3s\n", "[CV] END ..............................kmeans__n_clusters=18; total time= 3.1s\n", "[CV] END ..............................kmeans__n_clusters=18; total time= 2.9s\n", "[CV] END ..............................kmeans__n_clusters=19; total time= 3.0s\n", "[CV] END ..............................kmeans__n_clusters=19; total time= 3.1s\n", "[CV] END ..............................kmeans__n_clusters=19; total time= 3.2s\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ "GridSearchCV(cv=3,\n", " estimator=Pipeline(steps=[('kmeans',\n", " KMeans(n_clusters=50, random_state=42)),\n", " ('log_reg',\n", " LogisticRegression(max_iter=5000,\n", " multi_class='ovr',\n", " random_state=42))]),\n", " param_grid={'kmeans__n_clusters': range(2, 20)}, verbose=2)" ] }, "metadata": {}, "execution_count": 160 } ] }, { "cell_type": "code", "source": [ "grid_clf.best_params_" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "9RmSYkxEjwYf", 
"outputId": "6a9715a4-9452-405b-e20c-c131f3d24abc" }, "execution_count": 161, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "{'kmeans__n_clusters': 18}" ] }, "metadata": {}, "execution_count": 161 } ] }, { "cell_type": "code", "source": [ "grid_clf.score(X_test, y_test)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "syMQxC39jwzJ", "outputId": "d4cec9d6-a09b-4820-8228-39651adb7e2a" }, "execution_count": 162, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "0.96" ] }, "metadata": {}, "execution_count": 162 } ] }, { "cell_type": "markdown", "source": [ "### Using Clustering for Semi-Supervised Learning" ], "metadata": { "id": "HkGriveijyLO" } }, { "cell_type": "markdown", "source": [ "Another use case for clustering is in semi-supervised learning, when we have plenty of unlabeled instances and very few labeled instances.\n", "\n", "Let's look at the performance of a logistic regression model when we only have 50 labeled instances:" ], "metadata": { "id": "RitVYWXYj5G8" } }, { "cell_type": "code", "source": [ "n_labeled = 50" ], "metadata": { "id": "vkci_970j2AW" }, "execution_count": 163, "outputs": [] }, { "cell_type": "code", "source": [ "log_reg = LogisticRegression(multi_class=\"ovr\", solver=\"lbfgs\", random_state=42)\n", "log_reg.fit(X_train[:n_labeled], y_train[:n_labeled])\n", "log_reg.score(X_test, y_test)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "5g5AYlvJj7R2", "outputId": "27b70199-68d2-4867-a58c-b9af6dd4b62d" }, "execution_count": 164, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "0.8333333333333334" ] }, "metadata": {}, "execution_count": 164 } ] }, { "cell_type": "markdown", "source": [ "It's much less than earlier of course. Let's see how we can do better. First, let's cluster the training set into 50 clusters, then for each cluster let's **find the image closest to the centroid. 
We will call these images the representative images:**" ], "metadata": { "id": "eRBErxUFkD4X" } }, { "cell_type": "code", "source": [ "k = 50\n", "kmeans = KMeans(n_clusters=k, random_state=42)\n", "X_digits_dist = kmeans.fit_transform(X_train)\n", "representative_digit_idx = np.argmin(X_digits_dist, axis=0)\n", "X_representative_digits = X_train[representative_digit_idx]" ], "metadata": { "id": "7xWyUhAzkEzH" }, "execution_count": 165, "outputs": [] }, { "cell_type": "markdown", "source": [ "Now let's plot these representative images and label them manually:" ], "metadata": { "id": "SAO9pqHakPz-" } }, { "cell_type": "code", "source": [ "plt.figure(figsize=(8, 2))\n", "for index, X_representative_digit in enumerate(X_representative_digits):\n", " plt.subplot(k // 10, 10, index + 1)\n", " plt.imshow(X_representative_digit.reshape(8, 8), cmap=\"binary\", interpolation=\"bilinear\")\n", " plt.axis('off')\n", "plt.show()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 140 }, "id": "dcxow1HUkNr-", "outputId": "47081e97-eef0-45a7-f33f-a93ef53a36ef" }, "execution_count": 166, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": { "needs_background": "light" } } ] }, { "cell_type": "code", "source": [ "# Assuming we manually label these digits\n", "y_train[representative_digit_idx]" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "HyAAW_otkS1f", "outputId": "18c125b7-f1dc-4555-9f31-fef1513e5a19" }, "execution_count": 168, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array([4, 8, 0, 6, 8, 3, 7, 7, 9, 2, 5, 5, 8, 5, 2, 1, 2, 9, 6, 1, 1, 6,\n", " 9, 0, 8, 3, 0, 7, 4, 1, 6, 5, 2, 4, 1, 8, 6, 3, 9, 2, 4, 2, 9, 4,\n", " 7, 6, 2, 3, 1, 1])" ] }, "metadata": {}, "execution_count": 168 } ] }, { "cell_type": "code", "source": [ "y_representative_digits = y_train[representative_digit_idx]" ], "metadata": { "id": "x948FHgfkZNO" }, "execution_count": 169, "outputs": [] }, { "cell_type": "markdown", "source": [ "Now we have a dataset with just 50 labeled instances, but instead of being completely random instances, each of them is a representative image of its cluster. Let's see if the performance is any better:" ], "metadata": { "id": "cD9U11nHkdpG" } }, { "cell_type": "code", "source": [ "log_reg = LogisticRegression(multi_class=\"ovr\", solver=\"lbfgs\", max_iter=5000, random_state=42)\n", "log_reg.fit(X_representative_digits, y_representative_digits)\n", "log_reg.score(X_test, y_test)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "KVjHsaOskZ7W", "outputId": "e7d51462-5fc3-497f-cc21-b8bd48267a62" }, "execution_count": 170, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "0.9222222222222223" ] }, "metadata": {}, "execution_count": 170 } ] }, { "cell_type": "markdown", "source": [ "We jumped from 83.3% accuracy to 92.2%, although we are still only training the model on 50 instances. 
Since it's often costly and painful to label instances, especially when it has to be done manually by experts, it's a good idea to make them label representative instances rather than just random instances.\n", "\n", "But perhaps we can go one step further: **what if we propagated the labels to all the other instances in the same cluster?**" ], "metadata": { "id": "z0ZsKaR6kr0m" } }, { "cell_type": "code", "source": [ "y_train_propagated = np.empty(len(X_train), dtype=np.int32)\n", "for i in range(k):\n", " y_train_propagated[kmeans.labels_==i] = y_representative_digits[i]" ], "metadata": { "id": "xcvp7wKKkyMu" }, "execution_count": 171, "outputs": [] }, { "cell_type": "code", "source": [ "log_reg = LogisticRegression(multi_class=\"ovr\", solver=\"lbfgs\", max_iter=5000, random_state=42)\n", "log_reg.fit(X_train, y_train_propagated)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "OVrOBxjokz1H", "outputId": "d17fcb91-917b-4886-e047-e8dfc2e8dc03" }, "execution_count": 172, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "LogisticRegression(max_iter=5000, multi_class='ovr', random_state=42)" ] }, "metadata": {}, "execution_count": 172 } ] }, { "cell_type": "code", "source": [ "log_reg.score(X_test, y_test)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "wvjGUNwIk1lH", "outputId": "f965a33e-060e-49cf-bfd4-cb3a9a4513bf" }, "execution_count": 173, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "0.9333333333333333" ] }, "metadata": {}, "execution_count": 173 } ] }, { "cell_type": "markdown", "source": [ "We got a tiny little accuracy boost. 
Better than nothing, **but we should probably have propagated the labels only to the instances closest to the centroid, because by propagating to the full cluster, we have certainly included some outliers.** Let's propagate the labels only to the 75% of instances in each cluster that are closest to their centroid:" ], "metadata": { "id": "hYglrEiik27u" } }, { "cell_type": "code", "source": [ "percentile_closest = 75\n", "\n", "X_cluster_dist = X_digits_dist[np.arange(len(X_train)), kmeans.labels_]\n", "for i in range(k):\n", " in_cluster = (kmeans.labels_ == i)\n", " cluster_dist = X_cluster_dist[in_cluster]\n", " cutoff_distance = np.percentile(cluster_dist, percentile_closest)\n", " above_cutoff = (X_cluster_dist > cutoff_distance)\n", " X_cluster_dist[in_cluster & above_cutoff] = -1" ], "metadata": { "id": "tGl67Mrgk-yn" }, "execution_count": 174, "outputs": [] }, { "cell_type": "code", "source": [ "partially_propagated = (X_cluster_dist != -1)\n", "X_train_partially_propagated = X_train[partially_propagated]\n", "y_train_partially_propagated = y_train_propagated[partially_propagated]" ], "metadata": { "id": "mP1asARqlAgP" }, "execution_count": 175, "outputs": [] }, { "cell_type": "code", "source": [ "log_reg = LogisticRegression(multi_class=\"ovr\", solver=\"lbfgs\", max_iter=5000, random_state=42)\n", "log_reg.fit(X_train_partially_propagated, y_train_partially_propagated)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "yPqX-hDclCUV", "outputId": "253e83d8-7925-4215-ac53-8b2167daef84" }, "execution_count": 176, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "LogisticRegression(max_iter=5000, multi_class='ovr', random_state=42)" ] }, "metadata": {}, "execution_count": 176 } ] }, { "cell_type": "code", "source": [ "log_reg.score(X_test, y_test)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "sM6SiaJSlDt2", "outputId": "a3a6affd-19d2-4dd0-f641-20ef5dad4977" }, "execution_count": 177, "outputs": [ {
"output_type": "execute_result", "data": { "text/plain": [ "0.9355555555555556" ] }, "metadata": {}, "execution_count": 177 } ] }, { "cell_type": "markdown", "source": [ "A bit better. With just 50 labeled instances (just 5 examples per class on average!), we got 93.5% performance, which is getting closer to the performance of logistic regression on the fully labeled digits dataset." ], "metadata": { "id": "Undkod_hlNpt" } }, { "cell_type": "code", "source": [ "np.mean(y_train_partially_propagated == y_train[partially_propagated])" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Z31E9T9_leml", "outputId": "997c2543-ddc4-4728-db9a-1a2fbd3cf8ff" }, "execution_count": 178, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "0.9750747756729811" ] }, "metadata": {}, "execution_count": 178 } ] }, { "cell_type": "markdown", "source": [ "You could also do a few iterations of active learning:\n", "\n", "1. Manually label the instances that the classifier is least sure about, if possible by picking them in distinct clusters.\n", "2. Train a new model with these additional labels." ], "metadata": { "id": "QqP69ECmlhJV" } }, { "cell_type": "markdown", "source": [ "### Feature agglomeration" ], "metadata": { "id": "9-bBPvHMgqFC" } }, { "cell_type": "markdown", "source": [ "`cluster.FeatureAgglomeration` applies Hierarchical clustering to group together features that behave similarly." 
], "metadata": { "id": "SM_QtDIooF98" } }, { "cell_type": "code", "source": [ "X, y = load_iris(return_X_y=True)" ], "metadata": { "id": "-JqxixPEfyXO" }, "execution_count": 179, "outputs": [] }, { "cell_type": "code", "source": [ "#set n_clusters to 2, the output will be two columns of agglomerated features (iris has 4 features)\n", "agglo = FeatureAgglomeration(n_clusters=2).fit_transform(X)" ], "metadata": { "id": "9IvTKFMNnjiv" }, "execution_count": 182, "outputs": [] }, { "cell_type": "code", "source": [ "plt.scatter(agglo[:,0],agglo[:,1],c=y)\n", "plt.show()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 265 }, "id": "1ag31qi9nmF5", "outputId": "398f8927-6346-4b1a-b94a-894c0b5f94c3" }, "execution_count": 184, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": { "needs_background": "light" } } ] }, { "cell_type": "code", "source": [ "" ], "metadata": { "id": "6_wK8MRjn8Z8" }, "execution_count": null, "outputs": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.10" }, "nav_menu": {}, "toc": { "navigate_menu": true, "number_sections": true, "sideBar": true, "threshold": 6, "toc_cell": false, "toc_section_display": "block", "toc_window_display": false }, "colab": { "name": "11_DR_Clustering.ipynb", "provenance": [], "collapsed_sections": [], "toc_visible": true } }, "nbformat": 4, "nbformat_minor": 0 }